This code was used for the analyses of Cariaco Basin metagenomes and metatranscriptomes for the 2023 publication “Diverse secondary metabolites are expressed in particle-associated and free-living microorganisms of the permanently anoxic Cariaco Basin” by Geller-McGrath et al. Some sections of these analyses were completed on the Poseidon High Performance Computing (HPC) cluster based at the Woods Hole Oceanographic Institute (WHOI). All of the analyses done here using R can be run locally; HPC processing was used to speed up computations.


Load required R libraries

library(tidyverse)
library(furrr)
library(ComplexHeatmap)
library(readxl)
library(umap)
library(cowplot)
library(dbscan)
library(ggtext) # used for ggplots
library(glue) # used for ggplots
library(cowplot) # for combining various plots together in grids
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(janitor)
library(ggfortify)
library(grid)
# genbankr is required but not loaded


Figure 1. Normalized biosynthetic gene cluster count per phylum

# BGC frequency plot by phylum --------------------------------------------
mag_data = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/mags_75compl_5cont.tsv.gz',
                    show_col_types = FALSE) %>%
  rename(mag_name = bin_id) %>%
  mutate(mag_name = mag_name %>% str_replace('CarAnox_mtb2-', 'MAG_'))

plan(multisession, workers = 8)

filepaths = list.files(path = '~/Documents/cariaco-tidy-analysis/cariaco_github_data_small/bgc_data/all_bgcs',
                       pattern = '*.gbk',
                       full.names = TRUE)

filenames = filepaths %>% 
  str_replace('.*\\/(.*region.*)', '\\1')


## function to parse genbank files
parse_genbank = function(filepath, filename) {
  genbank = genbankr::parseGenBank(filepath)
  
  genbank = genbank$FEATURES %>%
    purrr::map_dfr(~ .x) %>%
    mutate(contig = genbank$ACCESSION,
           filename = filename,
           filepath = filepath) %>%
    relocate(c(filepath, filename, contig, product), .before = 1)
  
  return(genbank)
}


# create result and summary objects
bgc_summary = future_map2(filepaths, filenames, ~ parse_genbank(.x, .y))%>% 
  future_map_dfr(~ .x %>% 
                   filter(type == 'protocluster') %>%
                   select(product, contig, start, end,
                          core_location, detection_rule,
                          filename, filepath)) %>% 
  as_tibble() %>% 
  mutate(mag_name = contig %>% str_extract('(MAG_\\d+)'),
         bgc_size_bp = end - start) %>% #pull(bgc_size_bp) %>% {function(x){x > 0}}() %>% table()
  filter(bgc_size_bp >= 10000) %>% 
  group_by(contig) %>% 
  rename(product = product) %>% 
  mutate(is_hybrid = if_else(n() > 1, TRUE, FALSE)) %>% 
  ungroup() %>% 
  mutate(
    product_bin = case_when(
      !is_hybrid & str_detect(product, 'lanthipeptide|RRE-containing|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides') ~ 'RiPP',
      !is_hybrid & str_detect(product, 'lactone') ~ 'Lactone',
      !is_hybrid & str_detect(product, 'acyl_amino_acids|nucleoside|PBDE|indole|siderophore|resorcinol|ectoine|NAPAA|CDPS|LAP|redox-cofactor') ~ 'Other cluster*',
      !is_hybrid & str_detect(product, 'hglE-KS') ~ 'PKS',
      !is_hybrid & str_detect(product, 'arylpolyene') ~ 'Aryl polyene',
      !is_hybrid & str_detect(product, 'PKS') ~ 'PKS',
      !is_hybrid & str_detect(product, 'NRPS') ~ 'NRPS',
      !is_hybrid & str_detect(product, 'redox-cofactor') ~ 'Redox-cofactor',
      #
      is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-PKS hybrid',
      is_hybrid & str_detect(product, 'NRPS') & !str_detect(product, 'PKS|hglE-KS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'NRPS hybrid',
      is_hybrid & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|NRPS|hglE-KS') ~ 'RiPP hybrid',
      is_hybrid & str_detect(product, 'PKS|hglE-KS') & !str_detect(product, 'NRPS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'PKS hybrid',
      is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-RiPP hybrid',
      is_hybrid & str_detect(product, 'PKS|hglE-KS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'NRPS') ~ 'PKS-RiPP hybrid',
      is_hybrid & str_detect(product, 'terpene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS') ~ 'Terpene hybrid',
      is_hybrid & str_detect(product, 'ladderane') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Ladderane hybrid',
      is_hybrid & str_detect(product, 'arylpolyene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Aryl polyene hybrid',
      is_hybrid & str_detect(product, 'lactone') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Lactone hybrid',
      is_hybrid & str_detect(product, 'CDPS') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Other hybrid',
      #
      TRUE ~ product
    )) %>% 
  group_by(contig) %>% 
  mutate(product_bin = if_else(str_detect(product_bin, '^[:lower:]'), str_to_title(product_bin), product_bin), 
         new_product = case_when(
           product_bin == 'RiPP-like' ~ 'RiPP',
           product_bin %in% c('Prodigiosin','Redox-Cofactor','LAP','PUFA','Resorcinol','Other') ~ 'Other cluster*',
           str_detect(product_bin, 'hybrid') ~ 'Hybrid cluster',
           TRUE ~ product_bin
         )) %>% 
  relocate(new_product) %>% #pull(new_product) %>% table() %>% sort()
  group_by(contig) %>% 
  mutate(new_product = case_when(n() > 1 ~ 'Hybrid cluster', TRUE ~ new_product)) %>% 
  ungroup()

bgc_summary %>% 
  select(new_product, mag_name, contig) %>% 
  distinct() %>% 
  pull(new_product) %>% 
  table() %>% 
  sort()
## .
##    Phosphonate      Ladderane   Aryl polyene Other cluster*        Lactone 
##             29             51             57             57             96 
##            PKS           NRPS Hybrid cluster        Terpene           RiPP 
##            113            115            133            196            335
   # Phosphonate      Ladderane   Aryl polyene Other cluster*        Lactone            PKS           NRPS Hybrid cluster        Terpene           RiPP 
   #          29             51             57             57             96            113            115            133            196            335 


bgc_count_data_summary = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/bgc_count_data_summary.tsv.gz')
## Rows: 181 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): phylum, domain, product
## dbl (4): n_product_in_phylum, n_mags_in_phylum, norm_sm_freq, total_sm_freq
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
smc = c("Aryl polyene" = "#7E6EA2", 
        "Lactone" = "#cd5c5c", 
        "NRPS" = "#A0A811", 
        "PKS" = "#B78415", 
        "RiPP" = "#BD6332", 
        "Hybrid cluster" = "#B3499C", 
        "Ladderane" = "#E0A604", 
        "Other cluster*" = "#1B9E77", 
        "Phosphonate" = "#B294C7", 
        "Terpene" = "#7D8F31")

#smc = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc.rds')


highlight <- function(x, pat, color = "black", family = "") {
  ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}

face2 = bgc_count_data_summary %>%
  select(phylum, domain, n_mags_in_phylum) %>%
  distinct() %>%
  filter(n_mags_in_phylum == 1) %>%
  mutate(type = 'bold')


bact_final_sm_freq_plot = bgc_count_data_summary %>%
  filter(domain == 'Bacteria') %>%
  ggplot(aes(as_factor(phylum), norm_sm_freq,
             fill = fct_rev(fct_infreq(product)))) +
  geom_bar(stat = 'identity') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size = 10),
        axis.text.y = element_text(size = 10),
        axis.title = element_text(size = 12),
        legend.position = c(0.98, 0.98),
        legend.justification = c(0.98, 0.98),
        legend.text = element_text(size = 11),
        legend.title = element_text(size = 12),
        legend.background = element_blank(), panel.border =
          element_rect(color = 'black', fill = NA),
        legend.box.background = element_rect(color = 'black'),
        panel.grid = element_blank(),
        strip.text = element_text(size = 13),
        axis.title.y = element_blank(),
        strip.background = element_rect(fill = 'wheat1')
  ) +
  labs(x = '\nPhylum',
       fill = 'Secondary metabolite class') +
  scale_fill_manual(values = smc) +
  facet_grid(~ domain, space = 'free', scales = 'free') +
  scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
  theme(axis.text.x = element_markdown())# +


arch_final_sm_freq_plot = bgc_count_data_summary %>%
  filter(domain == 'Archaea') %>%
  ggplot(aes(as_factor(phylum), norm_sm_freq,
             fill = fct_rev(fct_infreq(product)))) +
  geom_bar(stat = 'identity') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1),
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 12),
        legend.position = 'none',
        panel.grid = element_blank(),
        strip.text = element_text(size = 13),
        axis.title.y = element_blank(),
        strip.background = element_rect(fill = 'wheat1')
  ) +
  labs(x = '\nPhylum',
       fill = 'Secondary metabolite class') +
  scale_fill_manual(values = smc) +
  facet_grid(~ domain, space = 'free', scales = 'free') +
  scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
  theme(axis.text.x = element_markdown())


# save the sm biosynthetic potential plot
sm_freq_plot = plot_grid(bact_final_sm_freq_plot, NULL, arch_final_sm_freq_plot,
                         labels = c('a', '', 'b'),
                         rel_widths = c(3, 0.02, 1),
                         nrow = 1,
                         align = 'hv'#,
                         #label_x = c(0.0975, 0, 0)
)
sm_freq_plot


Figure 2. Expression of secondary metabolite biosynthetic transcripts from individual MAGs in metatranscriptomic samples

mm2_final = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/logTransformed_tpm_for_bgc_genes_minimap2_res.tsv.gz')
## Rows: 23843 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_name
## dbl (48): MFL103_1, MFL103_2, MFL198_1, MFL198_2, MFL234_1, MFL234_2, MFL295...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metaT_col_order = tibble(
  sample_name = colnames(mm2_final)[2:ncol(mm2_final)],
  fraction = if_else(
    str_detect(sample_name, 'PA'), 'PA', 'FL'),
  depth = str_replace(
    sample_name, '.*(\\d{3}).*', '\\1')
) %>%
  arrange(desc(fraction), depth) %>%
  pull(sample_name)


# create color specifications for column annotations
anno_colors = list(
  Fraction = c(
    'PA' = 'darkgreen',
    'FL' = 'darkseagreen2'),
  Layer = c(
    'Oxycline' = 'darkslategray3', #'wheat2',
    'Shallow anoxic' = 'gold',# 'khaki2',
    'Euxinic' =  'sienna3'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

  `Depth (m)` = c(
    '103' = 'grey90',
    '148' = 'grey85',
    '198' = 'grey80',
    '200' = 'grey75',
    '234' = 'grey70',
    '237' = 'grey65',
    '247' = 'grey60',
    '267' = 'grey55',
    '295' = 'grey50',
    '314' = 'grey45',
    '900' = 'grey20'
  )
)


# final plot used in manuscript
mm2_final_mod =
  mm2_final %>%
  mutate(group_col = str_replace(
    gene_name, '(MAG_\\d+-ctg\\d+)_.*', '\\1'), .before = 1) %>%
  relocate(group_col) %>%
  mutate(rowSums = rowSums(.[3:ncol(.)]), .after = 1) %>%
  mutate(rs_pa = rowSums(.[str_detect(colnames(.), 'PA')]),
         rs_fl = rowSums(.[str_detect(colnames(.), 'FL')])) %>%
  relocate(c(rs_pa, rs_fl), .after = 1) %>%
  filter(
    rowSums > 19
  ) %>%
  group_by(group_col) %>%
  mutate(group_PA_rowSums = sum(rs_pa), .after = 1) %>%
  mutate(group_FL_rowSums = sum(rs_fl), .after = 1) %>%
  mutate(sorting_col = if_else(
    group_PA_rowSums > group_FL_rowSums, 'PA', 'FL'), .after = group_PA_rowSums) %>%
  ungroup() %>%
  arrange(desc(group_PA_rowSums), sorting_col) %>%
  select(-c(rowSums, group_PA_rowSums, group_FL_rowSums,
            rs_pa, rs_fl, group_col, sorting_col)) %>%
  column_to_rownames(var = 'gene_name') %>%
  relocate(all_of(metaT_col_order))


mm2_final_mod =
  mm2_final_mod %>%

  mutate(pa_sums = rowSums(.[str_detect(colnames(.), 'PA')]),
         fl_sums = rowSums(.[str_detect(colnames(.), 'FL')]) ) %>%
  relocate(c(pa_sums, fl_sums), .before = 1) %>%
  mutate(row_sorter = if_else(pa_sums > fl_sums, TRUE, FALSE), .before = 1) %>%
  arrange(desc(row_sorter), desc(pa_sums)) %>%
  select(-c(pa_sums, fl_sums, row_sorter)) %>%

  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}())

metaT_row_anno = tibble(
  gene_name = rownames(mm2_final_mod),
  mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1')
) %>%
  column_to_rownames(var = 'gene_name')


# create column annotation information object
col_anno = tibble(
  sample_name = colnames(mm2_final_mod),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name')


mm2_final_plot =
  mm2_final_mod %>%
  rownames_to_column(var = 'gene_name') %>%
  mutate(mag_name = str_replace(
    gene_name, '(MAG_\\d+)-.*', '\\1')) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
  select(-mag_name) %>%
  column_to_rownames(var = 'gene_name') %>%
  as.matrix() %>%

  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    show_colnames = FALSE,
    annotation_col = col_anno,
    #annotation_row = metaT_row_anno,
    annotation_colors = anno_colors,
    gaps_col = 24,
    clustering_method = 'mcquitty',
    #cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    border_color = 'grey60',
    treeheight_row = 0,
    treeheight_col = 0,
    color =
      c(#colorRampPalette(colors = 'white')(100),
        colorRampPalette(colors = c(
          'gray98',
          '#aae6fa',
          '#c9e6f0',
          '#f7e96d',
          '#f7e43b',
          'goldenrod1',
          'orange',
          '#ff6a00',
          '#D73027',
          'purple4'))(1000)
      ),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )



mm2_final_plot = ComplexHeatmap::draw(
  mm2_final_plot,
  merge_legend = TRUE,
  heatmap_legend_side = 'right',
  annotation_legend_side = 'right'
)


Figure 3. UMAP analysis for read mapping data of particle-associated and free-living metagenomes (a) and metatranscriptomes (b) to all BGCs longer than 10kb total length detected in Cariaco MAGs

# Metatranscriptome UMAP analysis ----------------------------------------------------
setwd('~/Documents/cariaco-tidy-analysis/')
rnaseq_mapping_counts = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/rnaseq_read_mapping_minimap2_on_cariaco_genes.tsv.gz')
## Rows: 1646447 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_name
## dbl (48): MFL103_1, MFL103_2, MFL198_1, MFL198_2, MFL234_1, MFL234_2, MFL295...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bgc_genes_10kb = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/bgc_genes_10kb.tsv.gz')
## Rows: 23146 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): mag_name, gene_name, contig, product
## dbl (2): region, cluster_kb
## lgl (1): contig_edge
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mm2_final = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/logTransformed_tpm_for_bgc_genes_minimap2_res.tsv.gz')
## Rows: 23843 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_name
## dbl (48): MFL103_1, MFL103_2, MFL198_1, MFL198_2, MFL234_1, MFL234_2, MFL295...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
col_metadata = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/col_metadata.tsv.gz')
## Rows: 47 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): column, season, fraction, layer
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rnaseq_umap_tbl = mm2_final |>
  filter(gene_name %in% bgc_genes_10kb$gene_name) |>
  mutate(contig = str_replace(gene_name, '(MAG_\\d+)-ctg(.*)_\\d+', '\\1_\\2'),
         mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) |>
  relocate(c(mag_name, contig), .before = 1) |>
  select(-gene_name) |>
  group_by(mag_name, contig) |>
  summarize(across(everything(), ~ sum(.x))) |>
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983'))
## `summarise()` has grouped output by 'mag_name'. You can override using the
## `.groups` argument.
# need the environmental metadata
cariaco_env_sheets = excel_sheets('~/Documents/cariaco-tidy-analysis/stables/Cariaco_Supplementary_files_2022.xlsx')

cariaco_env_vars = read_excel('~/Documents/cariaco-tidy-analysis/stables/Cariaco_Supplementary_files_2022.xlsx',
                              sheet = cariaco_env_sheets[6])

cariaco_env_vars = cariaco_env_vars |>
  filter(sample_type == 'metagenome') |>
  select(-c(ncbi_metagenome_name, ncbi_metatranscriptome_name,
            replicate, sample_type, year))

cariaco_env_vars = cariaco_env_vars |>
  mutate(across(where(is.character), ~ if_else(.x == 'NA', NA_character_, .x))) |>
    select(-c(NO3, H2S))

res_umap2 =
  rnaseq_umap_tbl |>
    ungroup() |>
    select(-mag_name) |>
    pivot_longer(
      2:last_col(),
      names_to = 'sample',
      values_to = 'count') |>
    pivot_wider(
      names_from = contig,
      values_from = count) |>
    column_to_rownames('sample') |>
    select(-where(~ sum(.x) == 0)) |>
    as.matrix() |>
    umap()


rna_hdbscan_res = hdbscan(res_umap2$layout, minPts = 5)
rna_hdbscan_res$cluster
##  [1] 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1
## [39] 1 1 1 1 1 1 1 1 1 1
umap_df2 <- res_umap2$layout |>
  as.data.frame() |>
  mutate(cluster = rna_hdbscan_res$cluster) |>
  rename(UMAP1="V1",
         UMAP2="V2") |>
  rownames_to_column('sample') |>
  as_tibble() |>
  left_join(cariaco_env_vars, by = c('sample' = 'sample_name_used_for_this_study')) |>
  filter(!is.na(depth))



# Metagenome UMAP analysis ----------------------------------------------------
dnaseq_umap_tbl = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/metaG_processed_mapping_tbl.tsv.gz')
## Rows: 1646447 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_name
## dbl (47): D1b900AM, D1b900BM, D2a143A, D2a143B, D2a200A, D2a200B, D2a237A, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dnaseq_umap_tbl = dnaseq_umap_tbl |>  #rnaseq_mapping_counts |>
  filter(gene_name %in% bgc_genes_10kb$gene_name) |>
  mutate(contig = str_replace(gene_name, '(MAG_\\d+)-ctg(.*)_\\d+', '\\1_\\2'),
         mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) |>
  relocate(c(mag_name, contig), .before = 1) |>
  select(-gene_name) |>
  group_by(mag_name, contig) |>
  summarize(across(everything(), ~ mean(.x))) |>
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983'))
## `summarise()` has grouped output by 'mag_name'. You can override using the
## `.groups` argument.
dnaseq_res_umap =
  dnaseq_umap_tbl |>
  ungroup() |>
  select(-mag_name) |>
  #column_to_rownames('contig') |>
  pivot_longer(
    2:last_col(),
    names_to = 'sample',
    values_to = 'count') |>
  pivot_wider(
    names_from = contig,
    values_from = count) |>

  left_join(col_metadata, by = c('sample' = 'column')) |>
  mutate(depth = str_replace(sample, '.*(\\d{3}).*', '\\1'),
         replicate = if_else(str_detect(sample, 'a'), 1, 2),
         new_colname = paste0(season, fraction, depth, '_', replicate)) |>
  select(-c(depth, replicate, season, fraction, sample, layer)) |>
  relocate(new_colname) |>
  column_to_rownames('new_colname') |>
  select(-where(~ sum(.x) == 0)) |>
  as.matrix() |>
  umap()

hdbscan_res_dna = hdbscan(dnaseq_res_umap$layout, minPts = 5)
hdbscan_res_dna$cluster
##  [1] 4 0 0 1 2 1 2 3 2 3 4 3 4 4 1 2 1 2 0 2 3 4 3 4 0 0 1 2 1 2 1 4 3 4 4 4 0 0
## [39] 0 2 1 2 1 4 3 4 0
dnaseq_umap_df <- dnaseq_res_umap$layout |>
  as.data.frame() |>
  mutate(cluster = hdbscan_res_dna$cluster) |>
  rename(UMAP1="V1",
         UMAP2="V2") |>
  rownames_to_column('sample') |>
  as_tibble() |>
  left_join(cariaco_env_vars, by = c('sample' = 'sample_name_used_for_this_study')) |>
  filter(!is.na(depth))



# create final joint umap plot - plotting code pasted here again ----------


#metaT plot
metaT_umap_result = umap_df2 |>
  mutate(
    size_fraction = if_else(
      size_fraction == 'free-living (0.2-2.7 um)',
      'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),

    redox_regime =
      str_to_title(redox_regime) |>
      paste0(' - ', size_fraction)
  ) |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             color = time_point,
             shape = redox_regime
  )) +
  geom_point(size = 4.5) +
  labs(x = "\nUMAP1",
       y = "UMAP2\n",
       color = 'Time point',
       shape = 'Redox regime/size fraction') +
  theme_bw() +
  scale_shape_manual(
    values = c(0,15,1,16,2,17)) +
  scale_color_manual(values = c(
    'May' = '#009E73',
    'November' = '#CC79A7')
  ) +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    panel.grid = element_blank(),
    legend.position = 'top') +
  guides(shape = 'none')



# metaG plot
metaG_umap_result = dnaseq_umap_df |>
  mutate(
    size_fraction = if_else(
      size_fraction == 'free-living (0.2-2.7 um)',
      'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),

    redox_regime =
      str_to_title(redox_regime) |>
      paste0(' - ', size_fraction)
  ) |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             color = time_point,
             shape = redox_regime
  )) +
  geom_point(size = 4.5) +
  labs(x = "\nUMAP1",
       y = "UMAP2\n",
       color = 'Time point',
       shape = 'Redox regime/size fraction'
  ) +
  theme_bw() +
  scale_shape_manual(
    values = c(0,15,1,16,2,17)) +
  scale_color_manual(values = c(
    'May' = '#E69F00',
    'November' = '#56B4E9')
  ) +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    panel.grid = element_blank(),
    legend.position = 'top') +
  guides(shape = 'none')


# just the metaT shape legend
metaT_shape_legend = umap_df2 |>
  mutate(
    size_fraction = if_else(
      size_fraction == 'free-living (0.2-2.7 um)',
      'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),

    redox_regime =
      str_to_title(redox_regime) |>
      paste0(' - ', size_fraction)
  ) |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             color = time_point,
             shape = redox_regime
  )) +
  geom_point(size = 4.5) +
  labs(x = "\nUMAP1",
       y = "UMAP2\n",
       color = 'Time point',
       shape = 'Redox regime/size fraction') +
  theme_bw() +
  scale_shape_manual(
    values = c(0,15,1,16,2,17)) +
  scale_color_manual(values = c(
    'May' = '#009E73',
    'November' = '#CC79A7')
  ) +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    panel.grid = element_blank()) +
  guides(color = 'none')

metaT_shape_legend = get_legend(metaT_shape_legend)



umap_final_joint_plot = plot_grid(
  metaG_umap_result, NULL, metaT_umap_result, metaT_shape_legend,
  nrow = 1,
  rel_widths = c(1, 0.05, 1, 1),
  labels = c('a', NA, 'b')
)
umap_final_joint_plot
## Warning: Removed 1 rows containing missing values (`geom_text()`).


Figure 4. Distribution of core and additional biosynthetic genes or domains and transcripts from biosynthetic gene clusters

smc_palette_names = c(
  'Aryl polyene',
  'Phosphonate',
  'Polyketide',
  'Other cluster*',
  'RiPP',
  'Terpene',
  'NRPS',
  'Hybrid cluster',
  'Lactone',
  'Ladderane'
)

smc_palette = c("#7E6EA2","#B294C7","#B78415","#1B9E77","#BD6332",
                 "#7D8F31","#A0A811","#B3499C","#CD5555","#E0A604") %>% 
  set_names(nm = smc_palette_names)

final_domain_summary = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles//final_domain_summary.tsv.gz')
## Rows: 36976 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): mag_name, gene_name, contig, description, detailed_product, product
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
final_de_and_clust_data = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/final_de_and_clust_data.tsv.gz')
## Rows: 5165 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (4): gene_name, mag_name, contig, product
## dbl (13): fl_counts, pa_counts, padj, baseMean, log2FoldChange, lfcSE, stat,...
## lgl  (4): is_sig_pa_biosynthetic, is_sig_fl_biosynthetic, is_biosynthetic, c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# create plot showing the most frequently occurring core/auxiliary biosynthetic genes in the Cariao MAGs
all_common_domains_plot =
  final_domain_summary %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  filter(!is.na(product)) %>%
  mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>% 
  mutate(description = case_when(
    str_detect(description,
               regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
    str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
    str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
    str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
    str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
    str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
    str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
    str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
    str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
    str_detect(description, regex(
      'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
      'FAD/FMN-dependent oxidoreductase',
    str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',

    TRUE ~ description)) %>% #view()
  filter(n > 0.2 * max(n) |
           description == 'FAD/FMN-dependent oxidoreductase'|
           description == 'FAD/FMN binding domain') %>% #pull(description) %>% unique()
  mutate(group = 'Core and additional biosynthetic genes') %>% #pull(description) %>% tabyl() %>% view()
  select(c(description, product, group)) %>%
  group_by(description, product, group) %>%
  mutate(number = n()) %>%
  ungroup() %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  distinct() %>%
  arrange(desc(n)) %>%
  filter(description != 'ABC Transporter') %>%

  ggplot(aes(
    y = fct_inorder(description),
    x = number,
    fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
  geom_bar(stat = 'identity', position = 'stack') +
  theme_bw() +
  facet_wrap(~ group) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 11),
    axis.text = element_text(size = 9),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = "wheat1"),
    panel.grid = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = c(0.9875, 0.98),
    legend.justification = c(1, 0.99),
    legend.background = element_blank(),
    panel.border = element_rect(color = 'black', fill = NA),
    legend.box.background = element_rect(color = 'black'),
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
    #legend.position = 'none'
  ) +
  scale_fill_manual(values = c(smc_palette), na.value = 'white') +
  labs(
    fill = 'Secondary metabolite class',
    y = 'Antibiotic resistance type',
    x = '\nTotal genes\n'
  ) +
  guides(fill = guide_legend(ncol = 2))




# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the PA metatranscriptomes
pa_domains_plot =
  final_de_and_clust_data %>%
  filter(gene_name %in% final_domain_summary$gene_name) %>%
  filter(is_sig_pa_biosynthetic | (pa_counts > 0 & fl_counts == 0)) %>%
  rename(raw_product = product) %>%
  left_join(final_domain_summary %>%
              select(c(mag_name, contig, gene_name, product, description)),
            by = c('contig', 'mag_name', 'gene_name')) %>%
  mutate(product = case_when(
    str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
    product == 'Other' ~ 'Other cluster*',
    product == 'PKS' ~ 'Polyketide',
    TRUE ~ product)) %>%
  select(-n) %>%
  filter(!is.na(product)) %>%
  mutate(description = case_when(
    str_detect(description,
               regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
    str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
    str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
    str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
    str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
    str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
    str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
    str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
    str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
    str_detect(description, regex(
      'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
      'FAD/FMN-dependent oxidoreductase',
    str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',

    TRUE ~ description)) %>%
  filter(description %in% all_common_domains_plot$data$description) %>%
  mutate(group = 'Particle-associated') %>% #pull(description) %>% tabyl() %>% view()
  select(c(description, product, group)) %>%
  group_by(description, product, group) %>%
  mutate(number = n()) %>%
  ungroup() %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  distinct() %>%
  arrange(desc(n)) %>%
  filter(description != 'ABC Transporter') %>%

  ggplot(aes(
    y = factor(description,
               levels = unique(all_common_domains_plot$data$description)),
    x = number,
    fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
  geom_bar(stat = 'identity', position = 'stack') +
  theme_bw() +
  facet_wrap(~ group) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 11),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = "wheat1"),
    panel.grid = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = 'none') +
  scale_fill_manual(values = c(smc_palette), na.value = 'white') +
  labs(
    fill = 'Secondary metabolite class',
    y = 'Antibiotic resistance type',
    x = '\nTotal transcripts') +
  scale_x_continuous(limits = c(0, 200))




# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the FL metatranscriptomes
fl_domains_plot =
  final_de_and_clust_data %>%
  filter(gene_name %in% final_domain_summary$gene_name) %>%
  filter(is_sig_fl_biosynthetic | (fl_counts > 0 & pa_counts == 0)) %>%
  rename(raw_product = product) %>%
  left_join(final_domain_summary %>%
              select(c(mag_name, contig, gene_name, product, description)),
            by = c('contig', 'mag_name', 'gene_name')) %>%
  mutate(product = case_when(
    str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
    product == 'Other' ~ 'Other cluster*',
    product == 'PKS' ~ 'Polyketide',
    TRUE ~ product)) %>%
  select(-n) %>%
  filter(!is.na(product)) %>%
  mutate(description = case_when(
    str_detect(description,
               regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
    str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
    str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
    str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
    str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
    str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
    str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
    str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
    str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
    str_detect(description, regex(
      'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
      'FAD/FMN-dependent oxidoreductase',
    str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',

    TRUE ~ description)) %>%
  filter(description %in% all_common_domains_plot$data$description) %>%
  mutate(group = 'Free-living') %>% #pull(description) %>% tabyl() %>% view()
  select(c(description, product, group)) %>%
  group_by(description, product, group) %>%
  mutate(number = n()) %>%
  ungroup() %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  distinct() %>%
  arrange(desc(n)) %>%
  filter(description != 'ABC Transporter') %>%

  ggplot(aes(
    y = factor(description,
               levels = unique(all_common_domains_plot$data$description)),
    x = number,
    fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
  geom_bar(stat = 'identity', position = 'stack') +
  theme_bw() +
  facet_wrap(~ group) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 11),
    axis.text = element_text(size = 9),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = "wheat1"),
    panel.grid = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = 'none') +
  scale_fill_manual(values = c(smc_palette), na.value = 'white') +
  labs(
    fill = 'Secondary metabolite class',
    y = 'Antibiotic resistance type',
    x = '\nTotal transcripts') +
  scale_x_reverse(limits = c(200, 0))



# combine the PA and FL transcript expression bar plots
final_domains_plotgrid = plot_grid(
  fl_domains_plot,
  pa_domains_plot,
  nrow = 1,
  rel_widths = c(0.65, 0.35)
)


# combine the expression barplots with the genomic potential bar plot to create the final plot
FINAL_ALL_DOMAINS_PLOT =
  plot_grid(all_common_domains_plot,
            final_domains_plotgrid,
            ncol = 1,
            labels = 'auto',
            rel_heights = c(0.51, 0.49))

FINAL_ALL_DOMAINS_PLOT


Supplementary Figure 1. Frequency of Cariaco prokaryotic MAGs (≥75% completeness,

≤5% contamination) by bacterial (a) and archaeal phylum (b)

# Distribution of MAG phyla recovered and average genome completeness per phylum --------------------------------
sm_key = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/sm_key.tsv.gz')
## Rows: 21082 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): mag_name, cluster, sm_class, gene_name
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mag_table = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/mag_table.tsv.gz')
## Rows: 565 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): mag_name, tax_num, domain, phylum, class, order, family, genus, spe...
## dbl (3): completeness, contamination, strain_het
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
heat_colors <- c(colorRampPalette(colors = 'white')(1),
                 colorRampPalette(colors = c('#ABD9E9',
                                             '#FEE090',
                                             'orange',
                                             '#D73027'))(8000))

sm_summary <- sm_key %>%
  select(mag_name, cluster, sm_class) %>%
  distinct() %>%
  left_join(mag_table, by = 'mag_name')

# plot used
mag_dist_plot_archaea <- mag_table %>%
  group_by(phylum) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  mutate(x = row_number(),
         phylum) %>%
  left_join(mag_table %>%
              select(phylum, domain, completeness) %>%
              group_by(phylum) %>%
              add_tally(mean(completeness), name = 'average_completeness') %>%
              select(-completeness) %>%
              ungroup() %>%
              distinct(),
            by = 'phylum') %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
  mutate(phylum = as_factor(phylum)) %>%
  filter(domain == 'Archaea') %>%
  ggplot(aes(phylum, freq)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 10),
        strip.background = element_rect(fill = 'wheat1'),
        axis.text.y = element_text(size = 8),
        legend.position = 'none'
  ) +
  geom_segment(aes(xend = phylum,  yend = 0), color = 'grey50') +
  geom_point(size = 1.5,
             aes(color = average_completeness)) +
  geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
  facet_grid(~ domain, scales = 'free', space = 'free') +
  labs(x = 'Phylum',
       y = 'Number of MAGs per phylum\n',
       color = 'Average genome
completeness per phylum',

  ) +
  scale_color_gradientn(colors = heat_colors,
                        breaks = seq(75, 100, by = 5),
                        limits = c(79, 100)
  ) +
  scale_y_continuous(limits = c(0, 75),
                     expand = expansion(add = c(1, 0)))


mag_dist_plot_bacteria =
  mag_table %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
  group_by(phylum) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  mutate(x = row_number(),
         phylum) %>%
  left_join(mag_table %>%
              select(phylum, domain, completeness) %>%
              group_by(phylum) %>%
              add_tally(mean(completeness), name = 'average_completeness') %>%
              select(-completeness) %>%
              ungroup() %>%
              distinct(),
            by = 'phylum') %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
  mutate(phylum = as_factor(phylum)) %>%
  filter(domain == 'Bacteria') %>%
  ggplot(aes(phylum, freq)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
        axis.title = element_text(size = 10),
        strip.text = element_text(size = 10),
        strip.background = element_rect(fill = 'wheat1'),
        legend.title = element_text(size = 8),
        legend.direction = 'horizontal',
        legend.margin = margin(c(4,15,4,4)),
        legend.text = element_text(size = 8),
        axis.text.y = element_text(size = 8),
        legend.position = c(0.71, 0.87),
        legend.background = element_rect(fill = "white",
                                         color = "black")
  ) +
  geom_segment(aes(xend = phylum,  yend = 0), color = 'grey50') +
  geom_point(size = 1.5,
             aes(color = average_completeness)) +
  geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
  facet_grid(~ domain, scales = 'free', space = 'free') +
  labs(x = 'Phylum',
       y = 'Number of MAGs per phylum\n',
       color = 'Average genome
completeness per phylum',
  ) +
  scale_color_gradientn(colors = heat_colors,
                        breaks = seq(75, 100, by = 5),
                        limits = c(79, 100)
  ) +
  scale_y_continuous(limits = c(0, 75),
                     expand = expansion(add = c(1, 0)))

 
mag_dist_plot = plot_grid(mag_dist_plot_bacteria, mag_dist_plot_archaea,
                          labels = c('a', 'b'),
                          rel_widths = c(4, 1),
                          nrow = 1,
                          label_size = 11.5,
                          align = 'hv',
                          label_x = c(0.045, 0.205))

mag_dist_plot


Supplementary Figure 2. MAG relative abundances

Process the CoverM relative abundance output files, plot relative abundance of the most abundant MAGs

mag_table = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/mag_table.tsv.gz')
## Rows: 565 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): mag_name, tax_num, domain, phylum, class, order, family, genus, spe...
## dbl (3): completeness, contamination, strain_het
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
final_genome_rel_abun_tbl = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/rel_abun_tibble_48_samples.tsv.gz')
## Rows: 569 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): Genome
## dbl (47): D1b900AM, D1b900BM, D2a143A, D2a143B, D2a200A, D2a200B, D2a237A, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# create sample names key
metaG_sampleNames_key.X = tibble(
  sample_name = colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)],
  depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
  season = case_when(
    str_detect(sample_name, 'AM|BM') ~ 'M',
    str_detect(sample_name, 'AN|BN') ~ 'N',
    depth %in% c(103,198,234,295,314) ~ 'M',
    depth %in% c(143,200,237,247,267) ~ 'N'),
  replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
  new_sample_name = paste0(season, fraction, depth, '_', replicate)
)

all(colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)] == metaG_sampleNames_key.X$sample_name)
## [1] TRUE
#TRUE

# color palette listing the color choices for all annotation rows/columns on the heatmap
relPalette <- list(Layer = c(
  'Oxycline' = 'darkslategray3',
  'Shallow anoxic' = 'gold',
  'Euxinic' =  'sienna3'),

  Fraction = c('PA' = 'darkgreen',
               'FL' = 'darkseagreen2'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

                   `Depth (m)` = c(
                     '103' = 'grey90',
                     '143' = 'grey85',
                     '198' = 'grey80',
                     '200' = 'grey75',
                     '234' = 'grey70',
                     '237' = 'grey65',
                     '247' = 'grey60',
                     '267' = 'grey55',
                     '295' = 'grey50',
                     '314' = 'grey45',
                     '900' = 'grey20'
                   ),

                   `Phylum/Class` = c('Planctomycetota' = 'wheat1',
                                      'Desulfobacterota' = 'tomato',
                                      'Myxococcota' = 'orange',
                                      'Alphaproteobacteria' = 'orchid1',
                                      'Gammaproteobacteria' = 'thistle1',
                                      'Chloroflexota' = 'slateblue1',
                                      'Marinisomatota' = 'aquamarine',
                                      'Aenigmarchaeota' = 'gold',
                                      'Krumholzibacteriota' = 'darkred',
                                      'Omnitrophota' = 'darkviolet',
                                      'Verrucomicrobiota' = 'khaki',
                                      'Thermoplasmatota' = 'firebrick1',
                                      'AABM5-125-24' = 'gainsboro',
                                      'Bacteroidota' = 'springgreen',
                                      'SAR324' = 'chocolate4',
                                      'Cloacimonadota' = 'grey28',
                                      'Crenarchaeota' = 'yellowgreen',
                                      'Acidobacteriota' = 'deepskyblue',
                                      'Actinobacteriota' = '#666666',
                                      'Eisenbacteria' = '#B89B74',
                                      'Gemmatimonadota' = 'lightsteelblue1',
                                      'Altiarchaeota' = 'ivory',
                                      'Nitrospinota' = '#2A7FB7',
                                      'Fermentibacterota' = '#1B9E77',
                                      'Latescibacterota' = '#AD4C9E',
                                      'Patescibacteria' = 'dodgerblue4',
                                      'Undinarchaeota' = 'turquoise',
                                      'Unclassified' = '#5A8950'
                   ))

mag_table = mag_table %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))


final_genome_rel_abun_tbl = 
  final_genome_rel_abun_tbl %>%
  rename(mag_name = Genome) %>%
  filter(!mag_name %in% c('unmapped', paste0('CarAnox_mtb2_', c(1507, 983, 769)))) %>%
  mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
  left_join(
    mag_table %>%
      select(mag_name, tax_num),
    by = 'mag_name'
  ) %>% 
  relocate(tax_num, .after = 1) %>%
  arrange(tax_num) %>%
  select(-mag_name) %>%
  column_to_rownames(var = 'tax_num') %>%
  mutate(across(1:last_col(), ~ log1p(.x)), # log-transform relative abundance percentages 
         row_sums = rowSums(.[1:ncol(.)])) %>%
  filter(row_sums > 0.005 * max(row_sums)) %>% # keep only the most abundant MAGs across the samples
  select(-row_sums) %>%
  rename_with(
    ~ metaG_sampleNames_key.X$new_sample_name,
    .cols = 1:last_col()) %>%
  relocate(
    metaG_sampleNames_key.X %>%
      arrange(desc(fraction), depth) %>%
      pull(new_sample_name),
    .after = 1)

# create character vector of phylums of the most abundant MAGs, sorted by the most to the least frequently appearing phylum
# split Proteobacteria in Alpha- and Gammaproteobacteria classes
phylum_order =
  final_genome_rel_abun_tbl %>%
  mutate(row_sums = rowSums(.),
         tax_num = rownames(.)) %>%
  left_join(
    mag_table %>%
      select(tax_num, phylum, class),
    by = 'tax_num') %>%
  select(c(row_sums, tax_num, phylum, class)) %>%
  mutate(phylum = if_else(phylum == 'Proteobacteria', class, phylum)) %>%
  select(-class) %>%
  group_by(phylum) %>%
  mutate(group_sum = sum(row_sums)) %>%
  ungroup() %>%
  arrange(desc(group_sum), phylum) %>%
  pull(phylum) %>%
  unique()

#change UAP2
phylum_order[28] = 'Undinarchaeota'

# create final relative abundance matrix to be used for plotting
final_genome_rel_abun_tbl =
  final_genome_rel_abun_tbl %>%
  mutate(row_sums = rowSums(.),
         tax_num = rownames(.)) %>%
  left_join(
    mag_table %>%
      select(tax_num, phylum, class),
    by = 'tax_num') %>%
  mutate(phylum = if_else(
    phylum == 'Proteobacteria', class, phylum)) %>%
  select(-class) %>%
  group_by(phylum) %>%
  mutate(group_sum = sum(row_sums)) %>%
  ungroup() %>%
  arrange(desc(group_sum), phylum) %>%
  select(-c(row_sums, group_sum, phylum)) %>%
  column_to_rownames(var = 'tax_num') %>%
  relocate( # function rearrange the columns by season, and water layer - incrementally increasing in depth
    colnames(.) %>%
      {function(x) {
        tbl = tibble(
          mpa = x[str_detect(x, 'MPA')],
          npa = sort(c('NPA143_2', x[str_detect(x, 'NPA')])),
          mfl = x[str_detect(x, 'MFL')],
          nfl = x[str_detect(x, 'NFL')]
        ) %>%
          mutate(groupper = rep(paste0('group_',
                                       seq(1, 6, by = 1)),
                                each = 2)) %>%
          group_by(groupper) %>%
          summarize(across(everything(), ~
                             paste0(.x, collapse = ';')),
                    .groups = 'drop') %>%
          select(-groupper)

        pa = as.vector(rbind(tbl$mpa, tbl$npa))
        fl = as.vector(rbind(tbl$mfl, tbl$nfl))

        g = c(pa, fl)
        g = str_split(g, pattern = ';') %>% unlist()
        g = g[g != 'NPA143_2']

        return(g)}}()
    ) %>%
  as.matrix()


# row metadata for relative abundance plot - the phylum/class of each MAG
row_anno =
  final_genome_rel_abun_tbl %>%
  as.data.frame() %>%
  rownames_to_column(var = 'tax_num') %>%
  select(tax_num) %>%
  left_join(
    mag_table %>%
      select(tax_num, phylum, class),
    by = 'tax_num') %>%
  as_tibble() %>%
  mutate(phylum = if_else(
    phylum == 'Proteobacteria', class, phylum)) %>%
  select(-class) %>%
  mutate(phylum = if_else(phylum == 'UAP2', 'Undinarchaeota', phylum)) %>% 
  rename(`Phylum/Class` = phylum) %>%
  arrange(factor(`Phylum/Class`, levels = phylum_order)) %>%
  column_to_rownames(var = 'tax_num')


# sort the Phylum/Class color palette by frequency
relPalette$`Phylum/Class` = relPalette$`Phylum/Class`[unique(row_anno$`Phylum/Class`)]


# column metadata for the relative abundance heatmap; annotation codings for sample fraction, sampling season, water layer, and water depth
col_anno = tibble(
  cols = colnames(final_genome_rel_abun_tbl),
  Fraction = str_replace(cols, '[A-Z]{1}([A-Z]{2})\\d{3}.*', '\\1'),
  Season = if_else(str_detect(cols, 'M'), 'May', 'November'),
  Layer = str_replace(cols, '[A-Z]+(\\d+)_.*', '\\1')
) %>%
  mutate(Layer = as.integer(Layer),
         Layer = case_when(
           Layer < 247 ~ 'Oxycline',
           Layer >= 247 & Layer < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(`Depth (m)` = str_replace(
    cols, '.*(\\d{3}).*', '\\1'
  )) %>%
  column_to_rownames(var = 'cols') %>%
  relocate(`Depth (m)`, Fraction, Season, Layer)

# create the heatmap
final_genome_rel_abun_plot = final_genome_rel_abun_tbl %>%
  pheatmap::pheatmap(
    name = 'Relative abundance (% total reads)',
    angle_col = '315',
    fontsize = 8,
    annotation_row = row_anno,
    annotation_col = col_anno,
    annotation_colors = relPalette,
    show_colnames = FALSE,
    gaps_col = 23,
    clustering_method = 'ward.D',
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    treeheight_row = 0,
    treeheight_col = 0,
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 3.5, by = 0.5),
    legend_labels = ComplexHeatmap::gt_render(c(
      '0%   ',
      '0.64%   ',
      '1.72%   ',
      '3.48%   ',
      '6.39%   ',
      '11.18%   ',
      '19.09%   ',
      '32.12%   '))
  )

#final_genome_rel_abun_plot


Supplementary Figure 3. MAG differential abundance (fraction preference)

Load required libraries, create matrix of counts. Each column will represent a metagenomic sample and rows will correspond to metagenome-assembled genomes (MAGs). Each cell will contain the number of reads that mapped to the contigs of a MAG.

library(furrr) # library of purrr-style iterators that utilize multiple cores for parallel processing
library(DESeq2) # library for running differential abundance analysis
library(BiocParallel) # for DESeq2 parallel processing


# if using a multicore machine, example furrr parallel processing parameters are below:  
plan(multicore, workers = 35) # 35 cores allotted to furrr's multicore iterators
options(future.globals.maxSize = 5242880000) #5000 * 1024^2; 5Gb memory allotted to each core

# set directory to where the coverM files are located
setwd('~/path/to/github_data/coverM/dna/final_counts_of_reads_mapped_to_genomes/')

files = system('ls *.tsv', intern = TRUE) 
sample_names = str_replace(files, '_counts_reads_mapped_per_genome.tsv', '')

# Create a list of read-mapping results (in tibble format); mapped reads to the MAGs (all contigs)
# Note: furrr's iterators are implemented like purrr's with the added 'future_' prepended to the function name; it will call the iterations in parallel across multiple cores
mapped_read_counts = future_map2( 
  files,
  sample_names, ~
    vroom::vroom(
      file = .x,
      delim = '\t',
      col_names = c('mag_name', .y),
      skip = 1,
      show_col_types = FALSE
    )
)

# Join all read-mapping results together into a tibble by the column of MAG names in each list component
mapped_read_counts =
  reduce(mapped_read_counts, left_join, by = 'mag_name')


Load additional data related to the differential abundance analysis, prep the DESeq2 object and run DESeq2

mag_table = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/mags_75compl_5cont.tsv.gz', show_col_types = FALSE)

taxonomic_ranks = c('domain', 'phylum', 'class', 'order', 'family', 'genus', 'species')

mag_table = mag_table |>
  rename(mag_name = bin_id) |>
  mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2-', 'MAG_')) |>
  select(mag_name, classification, completeness, contamination, strain_het) |>
  separate(classification, into = taxonomic_ranks, sep = ';') |>
  mutate(across(all_of(taxonomic_ranks), ~ str_replace(.x, '[:alpha:]__', ''))) |>
  mutate(across(all_of(taxonomic_ranks), ~ if_else(.x == '', NA_character_, .x))) |>
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
         tax_num = str_replace(mag_name, 'MAG', phylum), .after = mag_name)


# Create metadata tibble for the metagenomic samples
sampleNames_key <- tibble(
  sample_name = colnames(mapped_read_counts)[2:ncol(mapped_read_counts)]) %>%
  mutate(depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
         season = case_when(
           (depth == 103 | depth == 198 | depth == 234 | depth == 295 | depth == 314) ~ 'M',
           (depth == 143 | depth == 200 | depth == 237 | depth == 247 | depth == 267) ~ 'N',
                            str_detect(sample_name, 'M') ~ 'M',
                            str_detect(sample_name, 'N') ~ 'N'),
         fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
         replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
         new_column_names = paste0(season, fraction, depth, '_', replicate)) %>%
  arrange(factor(sample_name, levels = colnames(select(mapped_read_counts, -mag_name))))

all(sampleNames_key$sample_name == colnames(select(mapped_read_counts, -mag_name))) #TRUE

# Create sample metadata for use in the DESeq() function. This analysis will look for differential abundance patterns between fractions in three different chemically distinct water layers: the oxycline, shallow anoxic, and euxinic depths
coldata <- sampleNames_key %>%
  select(depth, fraction, new_column_names) %>%
  mutate(layer = case_when(depth == 900 ~ 'euxinic',
                           depth >= 103 & depth <= 237 ~ 'oxycline',
                           depth > 237 & depth < 900 ~ 'shallow.anoxic')) %>%
  select(-depth) %>%
  dplyr::rename(sample = new_column_names) %>%
  unite(group, c(fraction, layer), sep = '_') %>%
  relocate(sample) %>%
  mutate(group = as_factor(group))

# Rename the mapped-read counts tibble with metagenomic sample naming scheme used in the paper
mapped_read_counts = mapped_read_counts %>%
  rename_with(
    ~ sampleNames_key$new_column_names,
    .cols = 2:last_col()) %>%
  column_to_rownames(var = 'mag_name')

# Create the DESeq2 analysis object
da_dds <- DESeq2::DESeqDataSetFromMatrix(countData = mapped_read_counts,
                                         colData = coldata,
                                         design = ~ group)

# Run the DESeq2 statistical model on the data; assumes the data have a negative binomial distribution
da_final_dds <- DESeq2::DESeq(da_dds) 


Extract the results from the oxycline, shallow anoxic, and euxinic depths that compare PA and FL fraction types

# pull Wald test results comparing the oxycline PA to FL fractions
oxycline <- DESeq2::results(da_final_dds, contrast = c('group',
                                               'PA_oxycline', 'FL_oxycline'), alpha = 0.05)

# pull Wald test results comparing the euxinic PA to FL fractions
euxinic <- DESeq2::results(da_final_dds, contrast = c('group',
                                              'PA_euxinic', 'FL_euxinic'), alpha = 0.05)

# pull Wald test results comparing the shallow anoxic PA to FL fractions
shallow_anoxic <- DESeq2::results(da_final_dds, contrast = c('group', 'PA_shallow.anoxic',
                                                     'FL_shallow.anoxic'), alpha = 0.05)

# create a comprehensive tibble including all differential abundance results
# classify MAG fraction "preference" based on the direction of log2FoldChange and the adjusted p-values
allDaResults <- list(oxycline = oxycline,
                     shallow_anoxic = shallow_anoxic,
                     euxinic = euxinic) %>%
  imap(~ .x %>%
         as.data.frame() %>%
         rownames_to_column(var = 'mag_name') %>%
         mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
         as_tibble() %>%
         mutate(preference = case_when(log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
                                       log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
                                       padj > 0.05 | is.na(padj) ~ 'no significant difference'),
                layer = .y) %>%
         left_join(mag_table %>% select(mag_name, phylum), by = 'mag_name') %>%
         mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
                preference = factor(preference,
                                    levels = c(
                                      'particle enriched',
                                      'free-living enriched',
                                      'no significant difference'))))

# save the differential abundance final results tibble
allDaResults = allDaResults %>% 
  map_dfr(~ .x)

write_tsv(allDaResults, file = '~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/deseq2_results_by_water_layer.tsv.gz')


Plot the differential abundance results

library(tidyverse) # loads several useful data-wrangling and plotting libraries
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(cowplot) # for combining various plots together in grids
library(ggtext) # used for ggplots
library(glue) # used for ggplots


#locally load DESeq2 results
allDaResults = read_tsv(file = '~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/deseq2_results_by_water_layer.tsv.gz')
## Rows: 1704 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): mag_name, preference, layer, phylum
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
allDaResults = allDaResults %>% 
  group_by(layer) %>% 
  group_split() %>% 
  set_names(map_chr(., ~ unique(.x$layer)))

## plotting
magFractionPref <- allDaResults %>%
  map_dfr(~
            .x %>%
              mutate(layer = case_when(
                layer == 'oxycline' ~ 'Oxycline',
                layer == 'euxinic' ~ 'Euxinic',
                layer == 'shallow_anoxic' ~ 'Shallow anoxic',
              ))
          )

magFractionPref_plot = magFractionPref %>%
  ggplot(aes(y = fct_rev(fct_infreq(phylum)),
             fill = fct_relevel(preference, c('no significant difference',
                                              'free-living enriched',
                                              'particle enriched')))) +
  geom_bar() +
  labs(x = '\nNumber of genomes', y = 'Phylum\n', fill = 'Fraction preference') +
  theme_bw() +
  scale_fill_manual(values = c('grey80', 'palegreen3', 'burlywood2')) +
  facet_wrap(~ fct_relevel(layer, c('Oxycline', 'Shallow anoxic', 'Euxinic'))) +
  theme(
    axis.title = element_text(size = 10),
    strip.text = element_text(size = 10),
    strip.background = element_rect(fill = 'wheat1'),
    panel.grid = element_blank(),
    legend.position = 'top',
    legend.justification = c(0.92,0)
  )
magFractionPref_plot


Supplementary Figure 4. Distribution of biosynthetic gene clusters identified using antiSMASH 6.0

bgc_summary = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/bgc_summary.tsv.gz')
## Rows: 1553 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): product, contig, core_location, detection_rule, filename, filepath
## dbl (2): start, end
## lgl (1): contig_edge
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bgc_10k_kb = bgc_summary %>%
  mutate(size = end - start, .after = contig) %>%
  filter(size >= 10000)

bgc_count_data = bgc_10k_kb %>%
  mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1'), .before = contig) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% # remove MAGs that are from laboratory contamination
  select(-mag_name) %>%
  mutate(clust = paste0('clust_', 1:n()), .before = size) %>%
  group_by(contig) %>%
  group_split() %>%
  map_dfr(~ .x %>%
            mutate(ID.match = map2(
              1:length(start), list(.), ~
                .y %>%
                filter(
                  start > start[.x] & start < end[.x] | # start of the BGC is greater than your start, and less than your end
                    start < start[.x] & end > start[.x] | # start of the BGC is less than your start, and end is greater than your start
                    start == start[.x] & start < end[.x]) %>% # start of the BGC is equal to your start, and less than your end
                pull(clust)),
            .before = start)) %>%
  group_by(contig) %>%
  mutate(concat = list(unlist(ID.match)), .after = ID.match) %>%
  mutate(concat = map(concat, ~ unique(.x[duplicated(.x)]))) %>%
  mutate(length_concat = length(concat), .after = concat) %>%
  mutate(n = n(), .after = length_concat) %>%
  ungroup()


all(bgc_count_data$length_concat == bgc_count_data$n) # TRUE; all contigs w/ multiple BGCs have some level of overlap amongst them; can classify these all as 'hybrid clusters' of various sorts
## [1] TRUE
bgc_count_data = bgc_count_data %>%
  select(-c(ID.match, concat, length_concat)) %>%
  group_by(contig) %>%
  mutate(is_hybrid = if_else(n() > 1, TRUE, FALSE), .after = contig)


bgc_count_data_summary = bgc_count_data %>%
  ungroup() %>%
  select(-detection_rule) %>%
  mutate(product = str_replace(product, '-like', ''),
         product = str_replace(product, '.*PKS', 'PKS')) %>%
  arrange(contig, product) %>%
  group_by(contig, is_hybrid) %>%
  select(-c(filename, filepath)) %>%
  summarize(across(everything(), ~ paste0(.x, collapse = ';')), .groups = 'drop') %>%
  mutate(product = if_else(
    str_detect(product, ';'), str_replace(product, '(.*)', '\\1 hybrid'), product),
    product = str_replace_all(product, ';', '-')
  )

bgc_count_data_summary = bgc_count_data_summary %>%
  mutate(
    product = case_when(
      !is_hybrid & str_detect(product, 'lanthipeptide|RRE-containing|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides') ~ 'RiPP',
      !is_hybrid & str_detect(product, 'lactone') ~ 'Lactone',
      !is_hybrid & str_detect(product, 'acyl_amino_acids|nucleoside|PBDE|indole|siderophore|resorcinol|ectoine|NAPAA|CDPS|LAP|redox-cofactor') ~ 'other',
      !is_hybrid & str_detect(product, 'hglE-KS') ~ 'PKS',
      !is_hybrid & str_detect(product, 'arylpolyene') ~ 'Aryl polyene',
      !is_hybrid & str_detect(product, 'redox-cofactor') ~ 'Redox-cofactor',
      #
      is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-PKS hybrid',
      is_hybrid & str_detect(product, 'NRPS') & !str_detect(product, 'PKS|hglE-KS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'NRPS hybrid',
      is_hybrid & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|NRPS|hglE-KS') ~ 'RiPP hybrid',
      is_hybrid & str_detect(product, 'PKS|hglE-KS') & !str_detect(product, 'NRPS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'PKS hybrid',
      is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-RiPP hybrid',
      is_hybrid & str_detect(product, 'PKS|hglE-KS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'NRPS') ~ 'PKS-RiPP hybrid',
      is_hybrid & str_detect(product, 'terpene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS') ~ 'Terpene hybrid',
      is_hybrid & str_detect(product, 'ladderane') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Ladderane hybrid',
      is_hybrid & str_detect(product, 'arylpolyene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Aryl polyene hybrid',
      is_hybrid & str_detect(product, 'lactone') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Lactone hybrid',
      is_hybrid & str_detect(product, 'CDPS') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Other hybrid',
      #
      TRUE ~ product
    )) %>%
  mutate(product = if_else(str_detect(product, '^[:lower:]'), str_to_title(product), product))


bgc_count_data_summary = bgc_count_data_summary %>%
  mutate(mag_name = str_replace(contig, '(MAG_\\d+)_\\d+', '\\1'), .before = contig) %>% #create mag name col
  left_join(mag_table %>% select(mag_name, domain, phylum), by = 'mag_name') %>% # add phylum, domain cols
  relocate(c(domain, phylum), .after = 1) %>% # relocate cols
  select(c(phylum, domain, product)) %>% # select some cols
  mutate(n_product_in_phylum = 1,
         product = if_else(str_detect(product, 'hybrid'), 'Hybrid cluster', product)) %>% # edit product col, add sum col, sm class
  group_by(phylum, domain, product) %>%
  summarize(n_product_in_phylum = sum(n_product_in_phylum), .groups = 'drop') %>% # sum up each type of bgc in each phylum
  left_join(mag_table %>%
              select(phylum) %>%
              group_by(phylum) %>%
              summarize(n_mags_in_phylum = n()),
            by = 'phylum') %>% # join col that says how many MAGs are in each phylum
  mutate(norm_sm_freq = n_product_in_phylum / n_mags_in_phylum, # divide #bgc / # mag per phylum for each bgc per phylum
         product = if_else(product == 'Other', 'Other cluster*', product)) %>%
  #
  group_by(phylum) %>%
  add_tally(sum(norm_sm_freq), name = 'total_sm_freq') %>%
  ungroup() %>%
  arrange(desc(total_sm_freq), phylum) %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))


# Pie chart of Cariaco MAG BGCs groupped by class -------------------------

final_pie =
  bgc_count_data_summary %>%
  mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
  select(product, n_product_in_phylum) %>%
  group_by(product) %>%
  summarize(n_product_in_phylum = sum(n_product_in_phylum)) %>%
  left_join(
    tibble(
      product = names(smc_palette),
      color = unname(smc_palette)),
    by = 'product')


library(plotly) # load plotly for plotting
## 
## Attaching package: 'plotly'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     highlight
## 
## The following object is masked from 'package:ComplexHeatmap':
## 
##     add_heatmap
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# create the pie chart and save it
final_bgc_pie = plot_ly(final_pie,
                  labels = ~product,
                  values = ~n_product_in_phylum,
                  type = 'pie',
                  textposition = 'inside',
                  textinfo = 'label+percent+value',
                  insidetextfont = list(color = '#FFFFFF'),
                  text = ~n_product_in_phylum,
                  marker = list(colors = ~color,
                                line = list(
                                  color = '#FFFFFF',
                                  width = 1)),
                  showlegend = FALSE) %>%
  layout(xaxis = list(
    showgrid = FALSE,
    zeroline = FALSE,
    showticklabels = FALSE),
    yaxis = list(
      showgrid = FALSE,
      zeroline = FALSE,
      showticklabels = FALSE))

final_bgc_pie


Processing of RNA-Seq read-mapping .paf files from Minimap2

Process the .paf output files

# note: this section is not run, but to view the output file, run the read_tsv() command at the end of this code chunk


# load required libraries
library(tidyverse)
library(furrr)

#set parallel processing params
plan(multicore, workers = 48)
options(future.globals.maxSize = 52428800000)

# metadata file of gene names
gene_metadata = tibble(
  gene_name = readLines('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/final_concat_gene_names.txt')
) %>%
  separate(gene_name,
           into = c('gene_name', 'start', 'end', 'strand', 'extra_data'),
           sep = ' # ')

gene_metadata = gene_metadata %>%
  mutate(gene_name = str_replace(gene_name, '^>', ''))

empty_gene_counts_filler = gene_metadata %>%
  select(gene_name) %>%
  mutate(n_reads_mapped = 0L)

rm(gene_metadata)

# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used')

# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')

#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {

  mapping_results = tibble(temp = readLines(.file_path))

  mapping_results[c('qname', 'qlen', 'qstart', 'qend',
                    'strand', 'tname', 'tlen', 'tstart',
                    'tend', 'nmatch', 'alen', 'mapq',
                    'align_type', 'X14', 'X15', 'X16', 'X17')] <- str_split_fixed(mapping_results$temp, '\t', 17)

  mapping_results = mapping_results %>%
    select(-temp) %>% # remove temp col containing whole line information
    select(1:13) %>% # select columns 1:13
    mutate(across(c(qlen, qstart, qend,
                    tlen, tstart, tend,
                    nmatch, alen), ~ as.integer(.x)))

  # filter to keep only primary alignment results
  mapping_results = mapping_results %>%
    filter(
      align_type == 'tp:A:P', # retain primary alignments; note: there are no secondary/suppl. alignments in any .paf files
      mapq != 255, # filter out any reads that did not map; note: there were no such scores in any .paf files
      alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
      nmatch >= 0.95 * alen) %>%  # at least 95% of the aligned bases are a match
    group_by(qname) %>%
    filter(mapq == max(mapq)) %>%  # for any read with multiple primary mappings, keep alignment with highest score
    slice_sample(n = 1) %>%
    ungroup()

  mapping_results = mapping_results %>%
    group_by(tname) %>% # create groupped tibble; groupped by target sequences (genes)
    summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads

  # bind the counts of mapped reads to genes with all the other genes, counted zero times
  mapping_results = mapping_results %>%
    rename(gene_name = tname) %>%
    bind_rows(
      filter(.empty_gene_counts_filler, !(gene_name %in% .$gene_name))
    )
  
  return(mapping_results)
}

# process .paf files using parallel processing
processed_mapping_counts = future_map2(
  .options = furrr_options(seed = 111),
  list(empty_gene_counts_filler),
  file_paths, ~
    prepare_read_mappings(.x, .y)
)

processed_mapping_counts_colnames = names(processed_mapping_counts)

processed_mapping_counts = processed_mapping_counts %>% 
  reduce(left_join, by = 'gene_name') %>% 
  rename_with(~ processed_mapping_counts_colnames, .cols = 2:last_col())
  

# save the list of processed files
write_tsv(processed_mapping_counts, file = '~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/tbl_of_mapped_metaT_reads_from_each_sample.tsv.gz')


# to view the output:
processed_mapping_counts = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/tbl_of_mapped_metaT_reads_from_each_sample.tsv.gz')


Finish processing the read-mapping results, calculate normalized Transcript Per Million (TPM) values for the final dataset

# create tibble that has all metatranscriptome read-mapping results joined together
processed_mapping_counts =
  reduce(processed_mapping_counts, left_join, by = 'gene_name') %>% 
  rename_with( # rename the meteatranscriptome sample columns based on naming scheme used in the paper
    ~ sample_names,
    .cols = 2:last_col()) %>%
  mutate(gene_name = str_replace(
    gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2'))

#read in gene lengths data
gl = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/gene_lengths_tibble.tsv.gz')


gl = gl %>%
  mutate(gene_length = gene_length / 1000) # convert unit length to kilobases

# join gene length data to read-mapping data
# divide each read-mapping count by the length of the gene that it mapped to
processed_mapping_counts = processed_mapping_counts %>%
  left_join(gl, by = 'gene_name') %>%
  relocate(gene_length, .after = 1) %>%
  mutate(across(3:last_col(), ~ .x / gene_length))

# function to convert RNA-seq counts to TPM; TPM is normalized across and within samples
# the counts tibble has already had each cell divided by gene length
# this function will calculate the TPM scaling factor for each column and divide each cell in a column by the scaling factor, for all columns
get_tpm_for = function(col) {
  scaling_factor = sum(col) / 1e6
  col = col / scaling_factor
  return(col)
}

# finalize the TPM normalization
processed_mapping_counts = processed_mapping_counts %>%
  select(-gene_length) %>%
  mutate(across(2:last_col(), ~ get_tpm_for(.x)))

# read in biosynthetic gene cluster (BGC) metadata, parsed from antiSMASH 6 output
bgc = vroom::vroom(
  '/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
  show_col_types = FALSE)

bgc = bgc %>%
  mutate(gene_name = str_replace(
    gene_name,
    '(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
  ))

# filter the TPM-normalized read-mapping data to retain only biosynthetic gene cluster
# transcript mappings from biosynthetic clusters that were at least 10 kilobases in size
processed_mapping_counts = processed_mapping_counts %>%
  filter(gene_name %in% bgc$gene_name)


Processing of DNA-Seq read-mapping .paf files from Minimap2

Process the .paf output files

# note: this section is not run, but to view the output file, run the read_tsv() command at the end of this code chunk

# load required libraries
library(tidyverse)
library(furrr)

# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaG_mapping_final_results_used')

#set parallel processing params
plan(multicore, workers = 19)
#(125000 * 1024 ^2) %>% clipr::write_clip()
options(future.globals.maxSize = 1.31072e+11) #125000 * 1024 ^2

empty_gene_counts_filler = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/metaG_empty_gene_counts_filler.tsv.gz')

# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')

#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {

  mapping_results = tibble(temp = readLines(.file_path))

  mapping_results[c('qname', 'qlen', 'qstart', 'qend',
                    'strand', 'tname', 'tlen', 'tstart',
                    'tend', 'nmatch', 'alen', 'mapq',
                    'align_type', 'X14', 'X15', 'X16', 'X17')] <- str_split_fixed(mapping_results$temp, '\t', 17)

  mapping_results = mapping_results |>
    select(-temp) |> # remove temp col containing whole line information
    select(1:13) |> # select columns 1:13
    mutate(across(c(qlen, qstart, qend,
                    tlen, tstart, tend,
                    nmatch, alen), ~ as.integer(.x)))

  # filter to keep only top primary alignment results
  mapping_results = mapping_results |>
    filter(
      align_type == 'tp:A:P', # keep primary alignments
      mapq != 255, # filter out any reads that did not map
      alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
      nmatch >= 0.95 * alen) |>  # at least 95% of the aligned bases are a match
    group_by(qname) |>
    filter(mapq == max(mapq)) |>  # for any read with multiple primary mappings, keep alignment with highest score
    slice_sample(n = 1) |>
    ungroup()

  mapping_results = mapping_results %>%
    group_by(tname) %>% # create groupped tibble; groupped by target sequences (genes)
    summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads

  # bind the counts of mapped reads to genes with all the other genes, counted zero times
  mapping_results = mapping_results %>%
    rename(gene_name = tname) %>%
    bind_rows(
      filter(.empty_gene_counts_filler, !(gene_name %in% .$gene_name))
    )

  return(mapping_results)
}

# process .paf files using parallel processing
processed_mapping_counts = future_map2(
  list(empty_gene_counts_filler),
  file_paths, ~
    prepare_read_mappings(.x, .y)
)

processed_mapping_counts_colnames = 
  c("MFL103_1","MFL103_2","MFL198_1","MFL198_2","MFL234_1","MFL234_2","MFL295_1","MFL295_2","MFL314_1","MFL314_2","MFL900_1",
        "MFL900_2","MPA103_1","MPA103_2","MPA198_1","MPA198_2","MPA234_1","MPA234_2","MPA295_1","MPA295_2","MPA314_1","MPA314_2",
        "MPA900_1","MPA900_2","NFL148_1","NFL148_2","NFL200_1","NFL200_2","NFL237_1","NFL237_2","NFL247_1","NFL247_2","NFL267_1",
        "NFL267_2","NFL900_1","NFL900_2","NPA148_1","NPA148_2","NPA200_1","NPA200_2","NPA237_1","NPA237_2","NPA247_1","NPA247_2",
        "NPA267_1","NPA267_2","NPA900_1","NPA900_2")

# rename samples to use sample names used in the paper
processed_mapping_counts = processed_mapping_counts %>% 
  reduce(left_join, by = 'gene_name') %>% 
  rename_with(~ processed_mapping_counts_colnames, .cols = 2:last_col())
  

# save the list of processed files
write_tsv(processed_mapping_counts, file = '~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/tbl_of_mapped_metaT_reads_from_each_sample.tsv.gz')


# to view the output:
processed_mapping_counts = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/tbl_of_mapped_metaT_reads_from_each_sample.tsv.gz')


Differential gene expression analysis

Prepare gene expression matrix for DESeq2 differential expression analysis

# deseq2 ALL PA DEPTHS VS ALL FL DEPTHS run on minimap2 counts files -- 95% min percent identity of alignments at least 50% of read length aligned  --------

# load list of tibbles - each tibble containing the read-mapping counts to genes from each metatranscriptomic sample
mapped_read_counts = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/tbl_of_mapped_metaT_reads_from_each_sample.tsv.gz')

mapped_read_counts =
  mapped_read_counts %>%
  mutate(gene_name = str_replace(
    gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2'))  %>%
  as.data.frame() %>%
  column_to_rownames(var = 'gene_name')

# create column metadata for use with the DESeq() function
col_data = tibble(sample = colnames(mapped_read_counts)) %>%
  mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
         depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
         depth = as.integer(depth),
         layer = case_when(depth == 900 ~ 'euxinic',
                           depth >= 103 & depth <= 237 ~ 'oxycline',
                           depth > 237 & depth < 900 ~ 'shallow.anoxic'),
         group = paste0(fraction, '_', layer)) %>%
  select(-c(fraction, depth, layer)) %>%
  mutate(group = str_replace(group, '(.*)_.*', '\\1')) %>%
  mutate(group = as_factor(group))

all(colnames(mapped_read_counts) == col_data$sample) # TRUE

# load a tibble containing the length in bp of each gene from all Cariaco MAGs
gl = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/gene_lengths_tibble.tsv.gz')


gl = gl %>%
  mutate(gene_length = gene_length / 1000) # convert length unit to kilobases

logTransformed_mapping_tpm = mapped_read_counts %>%
  rownames_to_column(var = 'gene_name') %>%
  as_tibble() %>%
  left_join(gl, by = 'gene_name') %>%
  relocate(gene_length, .after = 1) %>%
  mutate(across(3:last_col(), ~ .x / gene_length))


get_tpm_for = function(col) {
  scaling_factor = sum(col) / 1e6
  col = col / scaling_factor
  return(col)
}

logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
  select(-gene_length) %>%
  mutate(across(2:last_col(), ~ get_tpm_for(.x))) %>%
  mutate(across(2:last_col(), ~ log1p(.x)))

bgc = vroom::vroom('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/bgc_data/bgc_genes_from_10kb_clusters.txt',
                   show_col_types = FALSE)

bgc = bgc %>%
  mutate(gene_name = str_replace(
    gene_name,
    '(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
  ))

logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
  filter(gene_name %in% bgc$gene_name)



da_dds <- DESeq2::DESeqDataSetFromMatrix(mapped_read_counts,
                                         colData = col_data,
                                         design = ~ group)

register(MulticoreParam(10))

da_final_dds <- DESeq2::DESeq(da_dds, parallel = TRUE) # run the DESeq2 statistical model which assumes a negative binomial 

# pull Wald test results comparing the oxycline PA to FL fractions
res <- DESeq2::results(da_final_dds,
                       contrast = c('group',
                                    'PA',
                                    'FL'),
                       alpha = 0.05)


final_res <- res %>%
  as.data.frame() %>%
  rownames_to_column(var = 'mag_name') %>%
  as_tibble() %>%
  mutate(enrichment = case_when(
    log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
    log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
    padj > 0.05 | is.na(padj) ~ 'no significant difference'))


col_metadata = tibble(sample = colnames(mapped_read_counts)[
  2:ncol(mapped_read_counts)
  ]) %>%
  mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
         depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
         depth = as.integer(depth),
         layer = case_when(depth == 900 ~ 'euxinic',
                           depth >= 103 & depth <= 237 ~ 'oxycline',
                           depth > 237 & depth < 900 ~ 'shallow.anoxic'))

mapped_read_counts = mapped_read_counts %>%
  rownames_to_column(var = 'gene_name') %>%
  as_tibble()

cs = mapped_read_counts %>%
  mutate(mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1'), .before = 1)

cs = cs %>%
  mutate(
    fl_counts = rowSums(select(., col_metadata %>%
                                 filter(fraction == 'FL') %>%
                                 pull(sample))),
    pa_counts = rowSums(select(., col_metadata %>%
                                 filter(fraction == 'PA') %>%
                                 pull(sample)))
  ) %>%
  relocate(c(fl_counts, pa_counts), .after = gene_name) %>%
  select(1:8)



#bgc = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/bgc_kb_5000_gene_data.rda')


bgc_uniq = bgc %>%
  select(mag_name, gene_name) %>%
  distinct() %>%
  mutate(is_biosynthetic = TRUE)


sig_genes = res %>%
  as.data.frame() %>%
  rownames_to_column(var = 'gene_name') %>%
  as_tibble() %>%
  left_join(bgc_uniq %>%
              select(-mag_name),
            by = 'gene_name')

#pa_sig
sig_genes %>%
  filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
  mutate(is_sig_pa_biosynthetic = TRUE) %>%
  select(gene_name, is_sig_pa_biosynthetic)


#fl_sig
sig_genes %>%
  filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
  mutate(is_sig_fl_biosynthetic = TRUE) %>%
  select(gene_name, is_sig_fl_biosynthetic)




clust_analysis = pmap_df(list(
  list(sig_genes),

  list(sig_genes %>%
         filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
         mutate(is_sig_pa_biosynthetic = TRUE) %>%
         select(gene_name, is_sig_pa_biosynthetic)),

  list(sig_genes %>%
         filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
         mutate(is_sig_fl_biosynthetic = TRUE) %>%
         select(gene_name, is_sig_fl_biosynthetic)),

  list(
    cs %>% select(2:4)
  )
), ~
  ..1 %>%
  left_join(..2, by = 'gene_name') %>%
  left_join(..3, by = 'gene_name') %>%
  left_join(..4, by = 'gene_name')
)

clust_data_pa = clust_analysis %>%
  filter(is_sig_pa_biosynthetic |
           (pa_counts > 0 & fl_counts == 0 & is_biosynthetic)
  ) %>%
  relocate(c(fl_counts, pa_counts, padj,
             is_sig_pa_biosynthetic,
             is_sig_fl_biosynthetic,
             is_biosynthetic), .after = gene_name) %>%
  left_join(bgc, by = 'gene_name')

clust_data_fl = clust_analysis %>%
  filter(is_sig_fl_biosynthetic |
           (fl_counts > 0 & pa_counts == 0 & is_biosynthetic)
  ) %>%
  relocate(c(fl_counts, pa_counts, padj,
             is_sig_pa_biosynthetic,
             is_sig_fl_biosynthetic,
             is_biosynthetic), .after = gene_name) %>%
  left_join(bgc, by = 'gene_name')


bgc_summary = bgc %>%
  group_by(contig, region) %>%
  summarize(n = n(), .groups = 'drop')


pa_expressed = clust_data_pa %>%
  left_join(bgc_summary, by = c('contig', 'region')) %>%
  group_by(contig, region) %>%
  rename(cluster_size = n) %>%
  add_tally() %>%
  ungroup() %>%
  mutate(proportion_expressed = n / cluster_size)

pa_expressed_summary = pa_expressed %>%
  select(product, proportion_expressed,
         contig, region, cluster_size, n) %>%
  distinct()

fl_expressed = clust_data_fl %>%
  left_join(bgc_summary, by = c('contig', 'region')) %>%
  group_by(contig, region) %>%
  rename(cluster_size = n) %>%
  add_tally() %>%
  ungroup() %>%
  mutate(proportion_expressed = n / cluster_size)


fl_expressed_summary = fl_expressed %>%
  select(product, proportion_expressed,
         contig, region, cluster_size, n) %>%
  distinct()


####
map_chr(ls(), ~ paste0(.x, ': ', object.size(get(.x)) %>% format(units = "Mb")))

final_de_and_clust_data = pa_expressed %>%
  bind_rows(fl_expressed)

test_final_de = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/final_de_and_uniq_expr_clust_data.tsv.gz')


#PA
test_final_de %>%
  filter(
    (padj < 0.05 & log2FoldChange > 0) |
      (pa_counts > 0 & fl_counts == 0)) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum, class),
    by = 'mag_name') %>%
  pull(phylum) %>%
  table() %>%
  sort()


#FL
test_final_de %>%
  filter(
    (padj < 0.05 & log2FoldChange < 0) |
      (fl_counts > 0 & pa_counts == 0)) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum, class),
    by = 'mag_name') %>%
  pull(phylum) %>%
  table() %>%
  sort()


test_final_de %>%
  filter(pa_counts > 0 & fl_counts == 0) %>%
  nrow()

test_final_de %>%
  filter(fl_counts > 0 & pa_counts == 0) %>%
  nrow()


clust_gene_summary = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/clust_gene_summary.tsv.gz')
## Rows: 77941 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (10): mag_name, filename, contig, product, type, ID, gene_identifier, aS...
## dbl  (3): start, end, evalue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
allDaResults = allDaResults %>%
  map_dfr(~ .x)

allDaResults = allDaResults %>%
  group_by(mag_name) %>%
  mutate(strict_fraction_preference = case_when(
    all(preference == 'particle enriched') ~ 'PA',
    all(preference == 'free-living enriched') ~ 'FL',
    TRUE ~ 'no strict preference'), .after = 1) %>%
  ungroup() %>%
  arrange(phylum, mag_name)

strict_pa_mags = allDaResults %>%
  filter(strict_fraction_preference == 'PA') %>%
  pull(mag_name) %>%
  unique()

strict_fl_mags = allDaResults %>%
  filter(strict_fraction_preference == 'FL') %>%
  pull(mag_name) %>%
  unique()

# fl transporter summary tibble
fl_transporter_summary =
  clust_gene_summary %>%
  filter(mag_name %in% strict_fl_mags) %>%
  left_join(mag_table %>%
              select(mag_name, phylum),
            by = 'mag_name') %>%
  group_by(contig, product) %>%
  mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
         .after = product) %>%
  mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
  select(mag_name, phylum, contig, product, contains_transporter) %>%
  mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
  select(-contains_transporter) %>%
  ungroup() %>%
  rename(contains_transporter = temp) %>%
  distinct()



# pa transporter summary tibble
pa_transporter_summary =
  clust_gene_summary %>%
  filter(mag_name %in% strict_pa_mags) %>%
  left_join(mag_table %>%
              select(mag_name, phylum),
            by = 'mag_name') %>%
  group_by(contig, product) %>%
  mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
         .after = product) %>%
  mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
  select(mag_name, phylum, contig, product, contains_transporter) %>%
  mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
  select(-contains_transporter) %>%
  ungroup() %>%
  rename(contains_transporter = temp) %>%
  distinct()


###
pa_transporter_summary %>%
  group_by(phylum) %>%
  summarize(
    has_transporter = sum(contains_transporter),
    no_transporter = sum(!contains_transporter)
  )
###
fl_transporter_summary %>%
  group_by(phylum) %>%
  summarize(
    has_transporter = sum(contains_transporter),
    no_transporter =  sum(!contains_transporter)) %>%
  print(n = 30)
## # A tibble: 25 × 3
##    phylum           has_transporter no_transporter
##    <chr>                      <int>          <int>
##  1 AABM5-125-24                   0              4
##  2 Acidobacteriota                3             19
##  3 Aenigmarchaeota                1              0
##  4 Altiarchaeota                  1              0
##  5 Bacteroidota                  10              2
##  6 Campylobacterota               1              0
##  7 Chloroflexota                  6             20
##  8 Cyanobacteria                  2              0
##  9 Desulfobacterota              11             27
## 10 Gemmatimonadota                2              4
## 11 Iainarchaeota                  0              1
## 12 Latescibacterota               8             14
## 13 Marinisomatota                 5              2
## 14 Nanoarchaeota                 10             13
## 15 Nitrospinota                   1              5
## 16 Omnitrophota                  50             58
## 17 Patescibacteria                1              0
## 18 Planctomycetota                9             13
## 19 Proteobacteria                47             48
## 20 SAR324                         7             15
## 21 Spirochaetota                  0              1
## 22 Thermoplasmatota               1              0
## 23 UAP2                           0              1
## 24 UBA10199                       6              4
## 25 <NA>                           5              1
#####
logTransformed_tpm_for_bgc_genes_minimap2_res = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/logTransformed_tpm_for_bgc_genes_minimap2_res.tsv.gz')
## Rows: 23843 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_name
## dbl (48): MFL103_1, MFL103_2, MFL198_1, MFL198_2, MFL234_1, MFL234_2, MFL295...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# temp_FL_rowAnno = temp_FL_rowAnno %>%
#   mutate(temp = as.numeric(contains_transporter))

bgc_genes_10kb =
  bgc_genes_10kb %>%
  mutate(gene_name = str_replace(gene_name, '(ctg.*_\\d+)-(MAG_\\d+)', '\\2-\\1'))


# tpm of each gene from clusters >= 10kb in size -- FL
temp_FL_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, contains('FL')) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(fl_transporter_summary %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  arrange(contains_transporter) %>%
  column_to_rownames(var = 'gene_name') %>%
  select(contains_transporter) %>%
  mutate(contains_transporter = as.numeric(contains_transporter))


temp_sum_FL_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, contains('FL')) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(fl_transporter_summary %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  select(-row_sums) %>%
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(contains_transporter) %>%
  mutate(contains_transporter = as.numeric(contains_transporter))
temp_sum_FL_rowAnno
temp_sum_PA_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, contains('PA')) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(pa_transporter_summary %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  select(-row_sums) %>%
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(contains_transporter) %>%
  mutate(contains_transporter = as.numeric(contains_transporter))


Supplementary Figure 5. Biosynthetic transcript expression of MAGs with strict PA and FL fraction preferences

# analysis by water layer -------------------------------------------------

# list of MAGs with FL fraction preference, one list per water layer
fl_fract_prefs_list = list(

  oxycline = allDaResults %>%
    filter(preference == 'free-living enriched', layer == 'oxycline', padj <= 0.01),

  shallow_anoxic = allDaResults %>%
    filter(preference == 'free-living enriched', layer == 'shallow_anoxic', padj <= 0.01),

  euxinic = allDaResults %>%
    filter(preference == 'free-living enriched', layer == 'euxinic', padj <= 0.01)
)

#View(fl_fract_prefs_list)

# list of MAGs with PA fraction preference, one list per water layer
pa_fract_prefs_list = list(

  oxycline = allDaResults %>%
    filter(preference == 'particle enriched', layer == 'oxycline', padj <= 0.01),

  shallow_anoxic = allDaResults %>%
    filter(preference == 'particle enriched', layer == 'shallow_anoxic', padj <= 0.01),

  euxinic = allDaResults %>%
    filter(preference == 'particle enriched', layer == 'euxinic', padj <= 0.01)
)

##
fl_transporter_list = fl_fract_prefs_list %>%
  map(~
  clust_gene_summary %>%
  filter(mag_name %in% .x$mag_name) %>%
  left_join(mag_table %>%
              select(mag_name, phylum),
            by = 'mag_name') %>%
  group_by(contig, product) %>%
  mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
         .after = product) %>%
  mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
  select(mag_name, phylum, contig, product, contains_transporter) %>%
  mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
  select(-contains_transporter) %>%
  ungroup() %>%
  rename(contains_transporter = temp) %>%
  distinct()
)

##
pa_transporter_list = pa_fract_prefs_list %>%
  map(~
        clust_gene_summary %>%
        filter(mag_name %in% .x$mag_name) %>%
        left_join(mag_table %>%
                    select(mag_name, phylum),
                  by = 'mag_name') %>%
        group_by(contig, product) %>%
        mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
               .after = product) %>%
        mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
        select(mag_name, phylum, contig, product, contains_transporter) %>%
        mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
        select(-contains_transporter) %>%
        ungroup() %>%
        rename(contains_transporter = temp) %>%
        distinct()
  )

# create color specifications for column annotations
anno_colors = list(
  Fraction = c(
    'PA' = 'darkgreen',
    'FL' = 'darkseagreen2'),
  Layer = c(
    'Oxycline' = 'darkslategray3', #'wheat2',
    'Shallow anoxic' = 'gold',# 'khaki2',
    'Euxinic' =  'sienna3'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

  `Depth (m)` = c(
    '103' = 'grey90',
    '148' = 'grey85',
    '198' = 'grey80',
    '200' = 'grey75',
    '234' = 'grey70',
    '237' = 'grey65',
    '247' = 'grey60',
    '267' = 'grey55',
    '295' = 'grey50',
    '314' = 'grey45',
    '900' = 'grey20'
  )
)


fl_prefer_expr_profile_oxy_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(103,148,198,200,234,237), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(fl_transporter_list$oxycline %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            contains_transporter, row_sums)) %>%
  relocate(contains('PA')) %>%
  as.matrix()


  # create column annotation information object
  oxy_fl_col_anno = tibble(
    sample_name = colnames(fl_prefer_expr_profile_oxy_data),
    Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
    Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
    Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
  ) %>%
    mutate(Depth = as.integer(Depth),
           Layer = case_when(
             Depth < 247 ~ 'Oxycline',
             Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
             TRUE ~ 'Euxinic'
           )) %>%
    mutate(Depth = as.character(Depth)) %>%
    rename(`Depth (m)` = Depth) %>%
    column_to_rownames(var = 'sample_name') %>%
    select(-c(`Depth (m)`, Layer))


  fl_prefer_expr_profile_oxy_plot = fl_prefer_expr_profile_oxy_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 12,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = oxy_fl_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


pa_prefer_expr_profile_oxy_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(103,148,198,200,234,237), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(pa_transporter_list$oxycline %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            contains_transporter, row_sums)) %>%
  relocate(contains('PA')) %>%
  as.matrix()


# create column annotation information object
oxy_pa_col_anno = tibble(
  sample_name = colnames(pa_prefer_expr_profile_oxy_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


pa_prefer_expr_profile_oxy_plot = pa_prefer_expr_profile_oxy_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 12,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = oxy_pa_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)
      ),
    legend_breaks = 0:4,
    legend_labels = gt_render(c(
      '0',
      '1.72',
      '6.39',
      '19.09',
      '53.60')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


oxycline_frac_pref_expr_plot = plot_grid(
  NULL,
  grid.grabExpr(draw(
    pa_prefer_expr_profile_oxy_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  NULL,
  grid.grabExpr(draw(
    fl_prefer_expr_profile_oxy_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  nrow = 1,
  rel_widths = c(0.1, 1, 0.1, 1),
  labels = c('PA','', 'FL', '')
)



# bgc expr profiles of fl- and pa- pref mags in -- SHALLOW ANOXIC ---------------
fl_prefer_expr_profile_anox_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(247,267,295,314), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
      left_join(fl_transporter_list$shallow_anoxic %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 2, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
anox_fl_col_anno = tibble(
  sample_name = colnames(fl_prefer_expr_profile_anox_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


fl_prefer_expr_profile_anox_plot = fl_prefer_expr_profile_anox_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 8,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = anox_fl_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


pa_prefer_expr_profile_anox_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(247,267,295,314), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
      left_join(pa_transporter_list$shallow_anoxic %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(row_sums, contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 2, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
anox_pa_col_anno = tibble(
  sample_name = colnames(pa_prefer_expr_profile_anox_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


pa_prefer_expr_profile_anox_plot = pa_prefer_expr_profile_anox_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 8,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = anox_pa_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)
      ),
    legend_breaks = 0:5,
    legend_labels = gt_render(c(
      '0',
      '1.72 x 10^0',
      '6.39 x 10^0',
      '1.91 x 10^1',
      '5.36 x 10^1',
      '1.47 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


anoxic_frac_pref_expr_plot = plot_grid(
  NULL,
  grid.grabExpr(draw(
    pa_prefer_expr_profile_anox_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  NULL,
  grid.grabExpr(draw(
    fl_prefer_expr_profile_anox_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  nrow = 1,
  rel_widths = c(0.1, 1, 0.1, 1),
  labels = c('PA','', 'FL', '')
)


# bgc expr profiles of fl- and pa- pref mags in -- EUXINIC ---------------
fl_prefer_expr_profile_eux_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', 900, '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
              left_join(fl_transporter_list$euxinic %>%
                          select(contig, product, contains_transporter),
                        by = c('contig', 'product')) %>%
              filter(!is.na(contains_transporter)) %>%
              select(-c(cluster_kb, contig_edge)),
            by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 1, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
eux_fl_col_anno = tibble(
  sample_name = colnames(fl_prefer_expr_profile_eux_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


fl_prefer_expr_profile_eux_plot = fl_prefer_expr_profile_eux_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 4,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = eux_fl_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


pa_prefer_expr_profile_eux_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', 900, '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
              left_join(pa_transporter_list$euxinic %>%
                          select(contig, product, contains_transporter),
                        by = c('contig', 'product')) %>%
              filter(!is.na(contains_transporter)) %>%
              select(-c(cluster_kb, contig_edge)),
            by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(row_sums, contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 1, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
eux_pa_col_anno = tibble(
  sample_name = colnames(pa_prefer_expr_profile_eux_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


pa_prefer_expr_profile_eux_plot = pa_prefer_expr_profile_eux_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 8,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = eux_pa_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)
      ),
    legend_breaks = 0:3,
    legend_labels = gt_render(c(
      '0',
      '1.72 x 10^0',
      '6.39 x 10^0',
      '1.91 x 10^1')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


euxinic_frac_pref_expr_plot = plot_grid(
  NULL,
  grid.grabExpr(draw(
    pa_prefer_expr_profile_eux_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  NULL,
  grid.grabExpr(draw(
    fl_prefer_expr_profile_eux_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  nrow = 1,
  rel_widths = c(0.1, 1, 0.1, 1),
  labels = c('PA','', 'FL', '')
)



all_bgc_expr_frac_pref_plots = plot_grid(
  NULL,
  oxycline_frac_pref_expr_plot,
  NULL,
  anoxic_frac_pref_expr_plot,
  NULL,
  euxinic_frac_pref_expr_plot,
  ncol = 2,
  nrow = 3,
  labels = c('a', '', 'b', '', 'c', ''),
  rel_widths = c(0.05, 1)
)
all_bgc_expr_frac_pref_plots


Supplementary Figure 6. Expression of biosynthetic transcripts from gene clusters

annotated as ladderanes

logTransformed_tpm_for_bgc_genes_minimap2_res = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles/logTransformed_tpm_for_bgc_genes_minimap2_res.tsv.gz')
## Rows: 23843 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_name
## dbl (48): MFL103_1, MFL103_2, MFL198_1, MFL198_2, MFL234_1, MFL234_2, MFL295...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ladderane_colAnno = read_tsv('~/Documents/cariaco-tidy-analysis/rds_to_flatfile/flatfiles//ladderane_colAnno.tsv.gz')
## Rows: 48 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Fraction, Season, Layer
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
allDaResults_df = allDaResults %>%
  map_dfr(~ .x) %>%
  group_by(mag_name) %>%
  mutate(strict_fraction_preference = case_when(
    all(preference == 'free-living enriched') ~ 'FL',
    all(preference == 'particle enriched') ~ 'PA',
    TRUE ~ 'No strict preference')) %>%
  ungroup()

ladderane_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  left_join(
    bgc_genes_10kb %>%
      select(gene_name, product, contig),
    by = 'gene_name') %>%
  relocate(c(product, contig), .after = 1) %>%
  filter(product == 'ladderane') %>%
  mutate(row_sums = rowSums(.[4:ncol(.)]), .after = contig) %>%
  filter(row_sums > 0) %>%
  select(-c(gene_name, row_sums)) %>%
  group_by(contig, product) %>%
  summarize(across(everything(), ~ sum(.x)), .groups = 'drop') %>%
  mutate(temp = paste0(contig, ';', product),
         mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) %>%
  select(temp, mag_name) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum),
    by = 'mag_name') %>%
  left_join(
    allDaResults_df %>%
      select(mag_name, strict_fraction_preference) %>%
      distinct(),
    by = 'mag_name') %>%
  select(-mag_name) %>%
  column_to_rownames(var = 'temp') %>%
  rename(
    Phylum = phylum,
    `Fraction preference` = strict_fraction_preference)


ladderane_relPalette <- list(

  Layer = c(
    'Oxycline' = 'darkslategray3', #'wheat2',
    'Shallow anoxic' = 'gold',# 'khaki2',
    'Euxinic' =  'sienna3'),

  Fraction = c('PA' = 'darkgreen',
               'FL' = 'darkseagreen2'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

  `Depth (m)` = c(
    '103' = 'grey90',
    '148' = 'grey85',
    '198' = 'grey80',
    '200' = 'grey75',
    '234' = 'grey70',
    '237' = 'grey65',
    '247' = 'grey60',
    '267' = 'grey55',
    '295' = 'grey50',
    '314' = 'grey45',
    '900' = 'grey20'),

  Phylum = c('Planctomycetota' = 'khaki',
             'Desulfobacterota' = '#666666',
             'Myxococcota' = 'orange',
             'Fibrobacterota' = 'orchid1',
             'GCA-001730085' = 'slateblue1',
             'Latescibacterota' = 'aquamarine',
             'Omnitrophota' = 'darkviolet',
             'Verrucomicrobiota' = 'gainsboro',
             'UBP3' = 'deepskyblue'),

  `Fraction preference` = c('FL' = 'grey70',
                            'PA' = 'black',
                            'No strict preference' = 'white'))

ladderane_heat_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  left_join(
    bgc_genes_10kb %>%
      select(gene_name, product, contig),
    by = 'gene_name') %>%
  relocate(c(product, contig), .after = 1) %>%
  filter(product == 'ladderane') %>%
  mutate(row_sums = rowSums(.[4:ncol(.)]), .after = contig) %>%
  filter(row_sums > 0) %>%
  select(-c(gene_name, row_sums)) %>%
  mutate(across(where(is.numeric), ~ expm1(.x))) %>%
  group_by(contig, product) %>%
  summarize(across(everything(), ~ sum(.x)), .groups = 'drop') %>%
  mutate(across(where(is.numeric), ~ log1p(.x))) %>%
  mutate(temp = paste0(contig, ';', product)) %>%
  mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum),
    by = 'mag_name') %>%
  group_by(phylum) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  column_to_rownames(var = 'temp') %>%
  arrange(desc(n), phylum) %>%
  select(-c(contig, product, phylum, mag_name, n)) %>%
  relocate(colnames(.) %>%
    {function(x) {
      tbl = tibble(
        mpa = x[str_detect(x, 'MPA')],
        npa = x[str_detect(x, 'NPA')],
        mfl = x[str_detect(x, 'MFL')],
        nfl = x[str_detect(x, 'NFL')]
      ) %>%
        mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
        group_by(groupper) %>%
        summarize(across(everything(), ~
                           paste0(.x, collapse = ';')),
                  .groups = 'drop') %>%
        select(-groupper)

      pa = as.vector(rbind(tbl$mpa, tbl$npa))
      fl = as.vector(rbind(tbl$mfl, tbl$nfl))

      g = c(fl, pa)
      g = str_split(g, pattern = ';') %>% unlist()

      return(g)}}()
    ) %>%
  as.matrix()

ladderane_colAnno = ladderane_colAnno %>%
  rownames_to_column(var = 'sample') %>%
  as_tibble() %>%
  arrange(factor(sample, levels = colnames(ladderane_heat_data))) %>%
  column_to_rownames(var = 'sample')


ladderane_rowAnno = ladderane_rowAnno %>%
  rownames_to_column(var = 'label') %>%
  as_tibble() %>%
  arrange(factor(label, levels = rownames(ladderane_heat_data))) %>%
  column_to_rownames(var = 'label')


ladderane_heat_plot = ladderane_heat_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    legend_breaks = 0:4,
    legend_labels = gt_render(c(
      '0',
      '1.72 x 10^0',
      '6.40 x 10^0',
      '1.91 x 10^1',
      '5.36 x 10^1')),
    fontsize = 8,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    show_rownames = FALSE,
    annotation_row = ladderane_rowAnno,
    annotation_col = ladderane_colAnno,
    annotation_colors = ladderane_relPalette,
    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          #'white',
          #'gray100',
          #'gray100',
          #'gray99',
          #'gray98',
          '#ebf4f7',
          '#aae6fa',
          #'#c9e6f0',
          '#f7e96d',
          #'yellow',
          '#f7e43b',
          #'goldenrod1',
          'orange',
          '#ff6a00',
          'firebrick1',
          #'#D73027',
          'darkorchid3',
          #'purple3',
          'purple4'))(3000)))


ladderane_heat_plot_final = ComplexHeatmap::draw(
  ladderane_heat_plot,
  merge_legend = TRUE,
  heatmap_legend_side = 'right'
  #annotation_legend_side = 'right'
  )


Command line script to extract biosynthetic gene cluster predictions from the Cariaco MAGs using the antiSMASH 6.0 command line tool

#!/bin/bash
#SBATCH --partition=computer # Queue selection
#SBATCH --job-name=antismash6_array # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address@gmail.com # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1 # one node
#SBATCH --cpus-per-task=1 # one CPU per parallel task
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/final_antismash_array_%A-%a.log# Standard output/error
#SBATCH --array= # fill this in with numbers uniquely affiliated with each of your MAGs
#SBATCH --qos=unlim
#
source activate anti6 # antismash 6.0 conda environment
GENOME_DIR=$(echo "path/to/intput/genome_fasta_files")
GBK_DIR=$(echo "path/to/output/genome_gene_prediction_gbk_files")
GFF_DIR=$(echo "path/to/output/genome/genome_gene_prediction_gff_files/files")
FASTA_DIR=$(echo "path/to/output/genome_gene_prediction_fasta_files")
OUT_DIR=$(echo "path/to/antismash/output/dir")

source activate gbk
cd "$GBK_DIR"

for FASTA in "$GENOME_DIR"/MAG_"$SLURM_ARRAY_TASK_ID".fa
do MAG=$(echo $FASTA | sed -r 's/.*(MAG_.*).fa/\1/')

#create gene predictions in FASTA format - used for rnaseq read alignment later
prodigal \
-i $FASTA \
-q \
-d "$FASTA_DIR"/"$MAG"-genes.fa

#create gene predictions in GFF format - to create GBK file antismash input
prodigal \
-i $FASTA \
-q \
-f gff \
-o "$GFF_DIR"/"$MAG".gff

#create GBK file that will be used as antismash input -- using gff_to_genbank.py script
python path/to/gff_to_genbank.py \
"$GFF_DIR"/"$MAG".gff $FASTA

mv "$GFF_DIR"/"$MAG".gb "$MAG".gbk

conda deactivate # deactivate conda env
source activate anti6 # activate antismash 6 conda env

antismash "$GBK_DIR"/"$MAG".gbk \
--output-dir "$OUT_DIR"/"$MAG" \
-c 36 \
--clusterhmmer \
--tigrfam \
--smcog-trees \
--cb-general \
--cb-subclusters \
--pfam2go \
--rre \
--cc-mibig \
--asf \
--allow-long-headers \
--genefinding-tool none \
--output-basename "$MAG"

done


Command line script to map metagenomic reads against Cariaco MAGs

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=minimap2_metaG # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/minimap2_metaG_mapping_over_cariaco_genes_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate minimap2

cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes

FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

REF_DIR=$(echo "/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX")

OUT_DIR=$(echo "/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaG_mapping_final_results_used")

minimap2 -t 36 -ax sr "$REF_DIR"/cariaco_minimap2_ref.mmi "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz > "$OUT_DIR"/"$FILE_PATTERN".sam

cd "$OUT_DIR"

paftools.js sam2paf "$FILE_PATTERN".sam > "$FILE_PATTERN".paf


Command line script to map metatranscriptomic reads against Cariaco MAGs

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=minimap2_metaT # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/minimap2_metaT_mapping_over_cariaco_genes_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate minimap2

cd /vortexfs1/omics/pachiadaki/dgellermcgrath/all-files-from-scratch-2-8-2021/cariaco_metatranscriptomes

FILE_PATTERN=("MFL103_1" "MFL103_2" "MFL198_1" "MFL198_2" "MFL234_1" "MFL234_2" "MFL295_1" "MFL295_2" "MFL314_1" "MFL314_2" "MFL900_1" "MFL900_2" "MPA103_1" "MPA103_2" "MPA198_1" "MPA198_2" "MPA234_1" "MPA234_2" "MPA295_1" "MPA295_2" "MPA314_1" "MPA314_2" "MPA900_1" "MPA900_2" "NFL148_1" "NFL148_2" "NFL200_1" "NFL200_2" "NFL237_1" "NFL237_2" "NFL247_1" "NFL247_2" "NFL267_1" "NFL267_2" "NFL900_1" "NFL900_2" "NPA148_1" "NPA148_2" "NPA200_1" "NPA200_2" "NPA237_1" "NPA237_2" "NPA247_1" "NPA247_2" "NPA267_1" "NPA267_2" "NPA900_1" "NPA900_2")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

minimap2 -t 36 -ax sr /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/cariaco_minimap2_ref.mmi "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz > /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/"$FILE_PATTERN".sam

cd /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/

paftools.js sam2paf "$FILE_PATTERN".sam > "$FILE_PATTERN".paf


Command line script to interactively run BiG-SCAPE on a HPC compute node on BGCs > 10kb in length

conda activate bigscape # conda env to run BiG-SCAPE

INPUT_GBK_DIR=$(echo "path/to/antismash6/BGC/gbk/files")

PFAM_DIR=$(echo "path/to/downloaded/pfam/directory")

python bigscape.py \
-i "$INPUT_GBK_DIR" \
-o cariaco_bigscape_results_FINAL \
-c 36 \
--pfam_dir "$PFAM_DIR" \
--mibig \
--clan_cutoff 0.3 0.7


Command line script to map reads to whole genomes, obtain counts for use as input for DESeq2 differential abundance analaysis

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=coverM_genomeCounts # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on one CPU
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/coverm_mag_counts_of_reads_mapped_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate coverm

cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes

GENOME_DIR=$(echo "path/to/genome/fasta/dir/")

OUT_DIR=$(echo "path/to/desired/output/dir")

FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

coverm genome \
--coupled "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz \
--genome-fasta-directory "$GENOME_DIR" \
--genome-fasta-extension fa \
--mapper bwa-mem \
--methods count \
--min-covered-fraction 0 \
--min-read-percent-identity 95 \
--min-read-aligned-percent 50 \
--exclude-supplementary \
--threads 36 \
-o "$OUT_DIR"/"$FILE_PATTERN"_counts_reads_mapped_per_genome.tsv


Command line script to map reads to whole genomes, obtain relative abundance information for Cariaco MAGS in each metagenomic sample

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=coverM_relAbun # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on one CPU
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/coverm_mag_counts_of_reads_mapped_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate coverm

cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes

GENOME_DIR=$(echo "path/to/genome/fasta/dir/")

OUT_DIR=$(echo "path/to/desired/output/dir")

FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

coverm genome \
--coupled "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz \
--genome-fasta-directory "$GENOME_DIR" \
--genome-fasta-extension fa \
--mapper bwa-mem \
--methods relative_abundance \
--min-read-percent-identity 95 \
--min-read-aligned-percent 50 \
--exclude-supplementary \
--threads 36 \
-o "$OUT_DIR"/"$FILE_PATTERN"_minReadAlignedPercent95.tsv